Lab 1

# Install C50 only if it is not already available; an unconditional reinstall
# re-downloads the package on every run and can fail when its files are in use.
if (!requireNamespace("C50", quietly = TRUE)) {
  install.packages("C50", repos = "http://cran.us.r-project.org")
}

## Installing package into 'C:/Users/FrankLin/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)

## package 'C50' successfully unpacked and MD5 sums checked

## Warning: cannot remove prior installation of package 'C50'

## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\FrankLin\Documents\R\win-library\3.6\00LOCK\C50\libs\x64\C50.dll
## to C:\Users\FrankLin\Documents\R\win-library\3.6\C50\libs\x64\C50.dll:
## Permission denied

## Warning: restored 'C50'

## 
## The downloaded binary packages are in
##  C:\Users\FrankLin\AppData\Local\Temp\Rtmp2vdYWf\downloaded_packages

# Install randomForest only if it is not already available; an unconditional
# reinstall re-downloads the package on every run and can fail when its files
# are in use.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest", repos = "http://cran.us.r-project.org")
}

## Installing package into 'C:/Users/FrankLin/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)

## package 'randomForest' successfully unpacked and MD5 sums checked

## Warning: cannot remove prior installation of package 'randomForest'

## Warning in file.copy(savedcopy, lib, recursive = TRUE):
## problem copying C:\Users\FrankLin\Documents\R\win-
## library\3.6\00LOCK\randomForest\libs\x64\randomForest.dll
## to C:\Users\FrankLin\Documents\R\win-
## library\3.6\randomForest\libs\x64\randomForest.dll: Permission denied

## Warning: restored 'randomForest'

## 
## The downloaded binary packages are in
##  C:\Users\FrankLin\AppData\Local\Temp\Rtmp2vdYWf\downloaded_packages

# Install rpart only if it is not already available, instead of re-downloading
# and reinstalling it on every run.
if (!requireNamespace("rpart", quietly = TRUE)) {
  install.packages("rpart", repos = "http://cran.us.r-project.org")
}

## Installing package into 'C:/Users/FrankLin/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)

## package 'rpart' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\FrankLin\AppData\Local\Temp\Rtmp2vdYWf\downloaded_packages

# Install caTools only if it is not already available; an unconditional
# reinstall re-downloads the package on every run and can fail when its files
# are in use.
if (!requireNamespace("caTools", quietly = TRUE)) {
  install.packages("caTools", repos = "http://cran.us.r-project.org")
}

## Installing package into 'C:/Users/FrankLin/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)

## package 'caTools' successfully unpacked and MD5 sums checked

## Warning: cannot remove prior installation of package 'caTools'

## Warning in file.copy(savedcopy, lib, recursive = TRUE):
## problem copying C:\Users\FrankLin\Documents\R\win-
## library\3.6\00LOCK\caTools\libs\x64\caTools.dll to C:
## \Users\FrankLin\Documents\R\win-library\3.6\caTools\libs\x64\caTools.dll:
## Permission denied

## Warning: restored 'caTools'

## 
## The downloaded binary packages are in
##  C:\Users\FrankLin\AppData\Local\Temp\Rtmp2vdYWf\downloaded_packages

# Install Metrics only if it is not already available, instead of
# re-downloading and reinstalling it on every run.
if (!requireNamespace("Metrics", quietly = TRUE)) {
  install.packages("Metrics", repos = "http://cran.us.r-project.org")
}

## Installing package into 'C:/Users/FrankLin/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)

## package 'Metrics' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\FrankLin\AppData\Local\Temp\Rtmp2vdYWf\downloaded_packages

# Install rpart.plot only if it is not already available, instead of
# re-downloading and reinstalling it on every run.
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot", repos = "http://cran.us.r-project.org")
}

## Installing package into 'C:/Users/FrankLin/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)

## package 'rpart.plot' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\FrankLin\AppData\Local\Temp\Rtmp2vdYWf\downloaded_packages

library(moments)
## Step 1: Collecting the data ##
# Load the credit data set (first row holds the column names) and inspect
# its structure.
credit <- read.csv(
  file = "/Users/FrankLin/Desktop/530/credit.csv",
  header = TRUE
)
str(credit)

## 'data.frame':    1000 obs. of  21 variables:
##  $ Creditability                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Account.Balance                  : int  1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration.of.Credit..month.       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment.Status.of.Previous.Credit: int  4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : int  2 0 9 0 0 0 0 0 3 3 ...
##  $ Credit.Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value.Savings.Stocks             : int  1 1 2 1 1 1 1 1 1 3 ...
##  $ Length.of.current.employment     : int  2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment.per.cent              : int  4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex...Marital.Status             : int  2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration.in.Current.address      : int  4 2 4 2 4 3 4 4 4 4 ...
##  $ Most.valuable.available.asset    : int  2 1 1 1 2 1 1 1 3 4 ...
##  $ Age..years.                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent.Credits               : int  3 3 3 3 1 3 3 3 3 3 ...
##  $ Type.of.apartment                : int  1 1 1 1 2 1 2 2 2 1 ...
##  $ No.of.Credits.at.this.Bank       : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : int  3 3 2 2 2 2 2 2 1 1 ...
##  $ No.of.dependents                 : int  1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign.Worker                   : int  1 1 1 2 2 2 2 2 1 1 ...

## Step 2: Exploring the data ##
# Min / quartiles / mean / max of the Credit.Amount column.
summary(credit[["Credit.Amount"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424

# Count the rows in each Creditability class.
table(credit[["Creditability"]])

## 
##   0   1 
## 300 700

# Fix the RNG seed so the shuffle below is reproducible.
set.seed(23458)
# Shuffle the rows by ordering on one uniform random draw per row. Sizing the
# draw with nrow(credit) (rather than a literal 1000) keeps every row exactly
# once even if the data set changes size.
credit_rand <- credit[order(runif(nrow(credit))), ]
# Summarise the shuffled copy: it should match the summary of the original
# data, confirming that the shuffle only reordered rows.
summary(credit_rand$Credit.Amount)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424

# Split the shuffled data by position: rows 1-900 train the model,
# rows 901-1000 are held out for testing.
credit_train <- credit_rand[seq_len(900), ]
credit_test <- credit_rand[seq(from = 901, to = 1000), ]
# Class proportions in the training set.
prop.table(table(credit_train[["Creditability"]]))

## 
##         0         1 
## 0.2988889 0.7011111

# Class proportions in the test set, for comparison with the training set.
prop.table(table(credit_test[["Creditability"]]))

## 
##    0    1 
## 0.31 0.69

##Step 3: Training a model on the data##
library(C50)

## Warning: package 'C50' was built under R version 3.6.3

# Recode the 0/1 outcome as a factor before it is used as the class label,
# then confirm the conversion.
credit_train[["Creditability"]] <- as.factor(credit_train[["Creditability"]])
str(credit_train[["Creditability"]])

##  Factor w/ 2 levels "0","1": 2 2 2 2 1 2 2 2 2 1 ...

# Fit a C5.0 decision tree. Predictors are every column except the first
# (Creditability); the factor outcome is passed separately as the class label.
credit_model <- C5.0(
  x = credit_train[-1],
  y = credit_train$Creditability
)
# Print the fitted tree, training confusion matrix and attribute usage.
summary(credit_model)

## 
## Call:
## C5.0.default(x = credit_train[-1], y = credit_train$Creditability)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Thu Jan 07 19:29:59 2021
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 900 cases (21 attributes) from undefined.data
## 
## Decision tree:
## 
## Account.Balance > 2:
## :...Concurrent.Credits <= 2:
## :   :...Purpose <= 0:
## :   :   :...Duration.of.Credit..month. <= 15: 1 (4)
## :   :   :   Duration.of.Credit..month. > 15: 0 (9/1)
## :   :   Purpose > 0:
## :   :   :...Purpose <= 4: 1 (32/2)
## :   :       Purpose > 4:
## :   :       :...Length.of.current.employment > 3: 1 (7/1)
## :   :           Length.of.current.employment <= 3:
## :   :           :...Instalment.per.cent <= 1: 1 (2)
## :   :               Instalment.per.cent > 1: 0 (8/1)
## :   Concurrent.Credits > 2:
## :   :...Duration.of.Credit..month. <= 33: 1 (307/29)
## :       Duration.of.Credit..month. > 33:
## :       :...Length.of.current.employment > 3: 1 (24)
## :           Length.of.current.employment <= 3:
## :           :...Type.of.apartment <= 1: 0 (4)
## :               Type.of.apartment > 1:
## :               :...Duration.in.Current.address > 3: 1 (7)
## :                   Duration.in.Current.address <= 3:
## :                   :...Payment.Status.of.Previous.Credit <= 1: 1 (3)
## :                       Payment.Status.of.Previous.Credit > 1: 0 (7/1)
## Account.Balance <= 2:
## :...Payment.Status.of.Previous.Credit <= 1:
##     :...Guarantors > 1: 1 (5/1)
##     :   Guarantors <= 1:
##     :   :...Type.of.apartment <= 1: 0 (13)
##     :       Type.of.apartment > 1:
##     :       :...Type.of.apartment > 2: 0 (11/1)
##     :           Type.of.apartment <= 2:
##     :           :...Sex...Marital.Status > 3: 1 (2)
##     :               Sex...Marital.Status <= 3:
##     :               :...Value.Savings.Stocks > 2:
##     :                   :...Credit.Amount <= 2064: 0 (2)
##     :                   :   Credit.Amount > 2064: 1 (7)
##     :                   Value.Savings.Stocks <= 2:
##     :                   :...Purpose <= 6: 0 (13/1)
##     :                       Purpose > 6:
##     :                       :...Credit.Amount <= 4221: 1 (4)
##     :                           Credit.Amount > 4221: 0 (4)
##     Payment.Status.of.Previous.Credit > 1:
##     :...Duration.of.Credit..month. > 22:
##         :...Value.Savings.Stocks > 3:
##         :   :...Account.Balance > 1:
##         :   :   :...Credit.Amount <= 9629: 1 (15)
##         :   :   :   Credit.Amount > 9629: 0 (2)
##         :   :   Account.Balance <= 1:
##         :   :   :...Most.valuable.available.asset > 2: 0 (7/1)
##         :   :       Most.valuable.available.asset <= 2:
##         :   :       :...Purpose <= 0: 0 (2)
##         :   :           Purpose > 0: 1 (5)
##         :   Value.Savings.Stocks <= 3:
##         :   :...Duration.of.Credit..month. > 47:
##         :       :...Occupation <= 3: 0 (17/1)
##         :       :   Occupation > 3:
##         :       :   :...Purpose <= 4: 0 (4)
##         :       :       Purpose > 4: 1 (2)
##         :       Duration.of.Credit..month. <= 47:
##         :       :...Length.of.current.employment > 3:
##         :           :...Value.Savings.Stocks > 1: 1 (11/1)
##         :           :   Value.Savings.Stocks <= 1:
##         :           :   :...No.of.dependents > 1:
##         :           :       :...Duration.in.Current.address <= 2: 0 (3/1)
##         :           :       :   Duration.in.Current.address > 2: 1 (7)
##         :           :       No.of.dependents <= 1:
##         :           :       :...Purpose <= 2: 0 (16/3)
##         :           :           Purpose > 2:
##         :           :           :...Instalment.per.cent <= 3: 0 (3/1)
##         :           :               Instalment.per.cent > 3: 1 (7)
##         :           Length.of.current.employment <= 3:
##         :           :...No.of.dependents > 1: 0 (9/1)
##         :               No.of.dependents <= 1:
##         :               :...Length.of.current.employment <= 1:
##         :                   :...Sex...Marital.Status <= 3: 1 (8/1)
##         :                   :   Sex...Marital.Status > 3: 0 (2)
##         :                   Length.of.current.employment > 1:
##         :                   :...Duration.in.Current.address <= 2:
##         :                       :...Value.Savings.Stocks > 1: 0 (5/1)
##         :                       :   Value.Savings.Stocks <= 1:
##         :                       :   :...Concurrent.Credits <= 1: 0 (5/1)
##         :                       :       Concurrent.Credits > 1: [S1]
##         :                       Duration.in.Current.address > 2:
##         :                       :...Type.of.apartment <= 1: 0 (9)
##         :                           Type.of.apartment > 1:
##         :                           :...Concurrent.Credits <= 2: 0 (5)
##         :                               Concurrent.Credits > 2:
##         :                               :...Credit.Amount <= 3448: 0 (7)
##         :                                   Credit.Amount > 3448:
##         :                                   :...Account.Balance > 1: 1 (3)
##         :                                       Account.Balance <= 1: [S2]
##         Duration.of.Credit..month. <= 22:
##         :...Guarantors > 2:
##             :...Purpose <= 1: 0 (3)
##             :   Purpose > 1: 1 (24)
##             Guarantors <= 2:
##             :...Payment.Status.of.Previous.Credit > 2:
##                 :...Guarantors <= 1: 1 (78/13)
##                 :   Guarantors > 1:
##                 :   :...Credit.Amount <= 2255: 1 (2)
##                 :       Credit.Amount > 2255: 0 (2)
##                 Payment.Status.of.Previous.Credit <= 2:
##                 :...No.of.Credits.at.this.Bank > 1:
##                     :...Sex...Marital.Status <= 2: 0 (6)
##                     :   Sex...Marital.Status > 2: 1 (6/1)
##                     No.of.Credits.at.this.Bank <= 1:
##                     :...Purpose > 8: 1 (10)
##                         Purpose <= 8:
##                         :...Credit.Amount > 7485: 0 (6)
##                             Credit.Amount <= 7485:
##                             :...Credit.Amount > 1372:
##                                 :...Concurrent.Credits <= 2: 1 (4)
##                                 :   Concurrent.Credits > 2:
##                                 :   :...Telephone > 1: [S3]
##                                 :       Telephone <= 1: [S4]
##                                 Credit.Amount <= 1372:
##                                 :...Telephone > 1: [S5]
##                                     Telephone <= 1:
##                                     :...Concurrent.Credits <= 2: 0 (5)
##                                         Concurrent.Credits > 2:
##                                         :...Account.Balance > 1: [S6]
##                                             Account.Balance <= 1: [S7]
## 
## SubTree [S1]
## 
## Most.valuable.available.asset <= 3: 1 (17/5)
## Most.valuable.available.asset > 3: 0 (2)
## 
## SubTree [S2]
## 
## Duration.of.Credit..month. <= 30: 1 (2)
## Duration.of.Credit..month. > 30: 0 (2)
## 
## SubTree [S3]
## 
## Duration.in.Current.address <= 2: 0 (7/2)
## Duration.in.Current.address > 2: 1 (9)
## 
## SubTree [S4]
## 
## Duration.of.Credit..month. <= 16: 1 (27/2)
## Duration.of.Credit..month. > 16:
## :...Credit.Amount <= 2528: 0 (4)
##     Credit.Amount > 2528: 1 (7/1)
## 
## SubTree [S5]
## 
## Length.of.current.employment <= 2: 0 (3/1)
## Length.of.current.employment > 2: 1 (6)
## 
## SubTree [S6]
## 
## Sex...Marital.Status > 3: 1 (3)
## Sex...Marital.Status <= 3:
## :...Duration.in.Current.address <= 2: 0 (4)
##     Duration.in.Current.address > 2: 1 (9/2)
## 
## SubTree [S7]
## 
## Most.valuable.available.asset > 1: 0 (13/1)
## Most.valuable.available.asset <= 1:
## :...Age..years. <= 26: 0 (3)
##     Age..years. > 26:
##     :...Occupation > 2: 1 (3)
##         Occupation <= 2:
##         :...Sex...Marital.Status <= 3: 0 (2)
##             Sex...Marital.Status > 3: 1 (2)
## 
## 
## Evaluation on training data (900 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      74   78( 8.7%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     210    59    (a): class 0
##      19   612    (b): class 1
## 
## 
##  Attribute usage:
## 
##  100.00% Account.Balance
##   87.78% Duration.of.Credit..month.
##   62.11% Concurrent.Credits
##   55.11% Payment.Status.of.Previous.Credit
##   34.33% Guarantors
##   30.67% Purpose
##   23.00% Value.Savings.Stocks
##   21.56% Length.of.current.employment
##   18.78% Credit.Amount
##   15.44% No.of.Credits.at.this.Bank
##   12.56% Duration.in.Current.address
##   12.44% No.of.dependents
##   11.89% Telephone
##   11.67% Type.of.apartment
##    8.22% Sex...Marital.Status
##    6.22% Most.valuable.available.asset
##    3.33% Occupation
##    2.22% Instalment.per.cent
##    1.11% Age..years.
## 
## 
## Time: 0.0 secs

## Step 4: Evaluating Model Performance
# Predict the class of every held-out row with the fitted C5.0 tree.
cred_pred <- predict(object = credit_model, newdata = credit_test)
library(gmodels)

## Warning: package 'gmodels' was built under R version 3.6.3

# Cross-tabulate actual vs. predicted classes; only cell counts and
# table-total proportions are kept (row, column and chi-square
# contributions are switched off).
CrossTable(
  x = credit_test$Creditability,
  y = cred_pred,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE,
  dnn = c('Actual Creditability', 'Predicted Creditability')
)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                      | Predicted Creditability 
## Actual Creditability |         0 |         1 | Row Total | 
## ---------------------|-----------|-----------|-----------|
##                    0 |        15 |        16 |        31 | 
##                      |     0.150 |     0.160 |           | 
## ---------------------|-----------|-----------|-----------|
##                    1 |        14 |        55 |        69 | 
##                      |     0.140 |     0.550 |           | 
## ---------------------|-----------|-----------|-----------|
##         Column Total |        29 |        71 |       100 | 
## ---------------------|-----------|-----------|-----------|
## 
##

Q1 - If you see an accuracy of 100%, what does it mean? Does this mean that we have designed a perfect model? This is something that needs more discussion. Write a few sentences about an accuracy of 100%.

Answer: If the accuracy is 100%, I think something is wrong with my model. No, it is not a perfect model. Probably the test set is very similar to the training set.

##Method2 Random forest 
library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

# Make sure the outcome is a factor, then fit a random forest that predicts
# Creditability from all remaining columns of the training set.
credit_train[["Creditability"]] <- as.factor(credit_train[["Creditability"]])
random_model <- randomForest(
  Creditability ~ .,
  data = credit_train
)
# List the components stored in the fitted forest object.
summary(random_model)

##                 Length Class  Mode     
## call               3   -none- call     
## type               1   -none- character
## predicted        900   factor numeric  
## err.rate        1500   -none- numeric  
## confusion          6   -none- numeric  
## votes           1800   matrix numeric  
## oob.times        900   -none- numeric  
## classes            2   -none- character
## importance        20   -none- numeric  
## importanceSD       0   -none- NULL     
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            14   -none- list     
## y                900   factor numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call

# Predict the held-out rows with the random forest (this overwrites the
# earlier C5.0 predictions stored in cred_pred).
cred_pred <- predict(random_model, newdata = credit_test)
# Confusion table: predicted class in rows, actual class in columns.
p <- table(cred_pred, credit_test$Creditability)
p

##          
## cred_pred  0  1
##         0 11  5
##         1 20 64

# Accuracy in percent: correctly classified cases (the diagonal of the
# confusion table) divided by all cases.
Accuracy <- sum(diag(p)) / sum(p) * 100
Accuracy

## [1] 75

## Q2 - What are the three most important features in this model?
## Answer: Credit.Amount, Account.Balance, and Age..years. (the three largest MeanDecreaseGini values).
## With the seed set to 23458, the accuracy is 75%.

# Per-feature importance scores of the fitted random forest.
randomForest::importance(random_model)

##                                   MeanDecreaseGini
## Account.Balance                          39.457314
## Duration.of.Credit..month.               38.019433
## Payment.Status.of.Previous.Credit        22.546142
## Purpose                                  23.951809
## Credit.Amount                            50.541383
## Value.Savings.Stocks                     17.205724
## Length.of.current.employment             20.627185
## Instalment.per.cent                      15.719838
## Sex...Marital.Status                     13.367657
## Guarantors                                7.016187
## Duration.in.Current.address              15.719091
## Most.valuable.available.asset            16.545241
## Age..years.                              38.381132
## Concurrent.Credits                        9.531617
## Type.of.apartment                        10.301770
## No.of.Credits.at.this.Bank                7.946921
## Occupation                               12.310681
## No.of.dependents                          5.217456
## Telephone                                 7.157283
## Foreign.Worker                            1.465606

# Method 3: Adding regression to trees
# Step 1: Collecting the Data
# Load the white-wine data set (first row holds the column names) and
# inspect its structure.
wine <- read.csv(
  file = "/Users/FrankLin/Desktop/530/whitewines.csv",
  header = TRUE
)
str(wine)

## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
##  $ volatile.acidity    : num  0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
##  $ citric.acid         : num  0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
##  $ residual.sugar      : num  1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
##  $ chlorides           : num  0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
##  $ free.sulfur.dioxide : num  6 41 33 11 36 22 33 17 34 40 ...
##  $ total.sulfur.dioxide: num  62 113 123 74 119 95 152 67 90 130 ...
##  $ density             : num  0.993 0.999 0.995 0.991 0.993 ...
##  $ pH                  : num  3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
##  $ sulphates           : num  0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
##  $ alcohol             : num  10.4 8.9 10.1 11.2 10.9 ...
##  $ quality             : int  5 6 6 4 6 6 6 6 6 7 ...

# Histogram of the quality scores, to inspect the distribution of the column
# that is modelled as the response below.
hist(wine$quality)

# Step 2: Exploring and Preparing the Data
# Split by position: the first 3750 rows train the model and
# rows 3751-4898 are held out for testing.
wine_train <- wine[seq_len(3750), ]
wine_test <- wine[seq(from = 3751, to = 4898), ]

#Step 3: Training a Model on the Data
library(rpart)

## Warning: package 'rpart' was built under R version 3.6.3

# Fit an rpart tree that predicts quality from all other columns of the
# training set, then print its splits and node statistics.
m.rpart <- rpart(formula = quality ~ ., data = wine_train)
print(m.rpart)

## n= 3750 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 3750 2945.53200 5.870933  
##    2) alcohol< 10.85 2372 1418.86100 5.604975  
##      4) volatile.acidity>=0.2275 1611  821.30730 5.432030  
##        8) volatile.acidity>=0.3025 688  278.97670 5.255814 *
##        9) volatile.acidity< 0.3025 923  505.04230 5.563380 *
##      5) volatile.acidity< 0.2275 761  447.36400 5.971091 *
##    3) alcohol>=10.85 1378 1070.08200 6.328737  
##      6) free.sulfur.dioxide< 10.5 84   95.55952 5.369048 *
##      7) free.sulfur.dioxide>=10.5 1294  892.13600 6.391036  
##       14) alcohol< 11.76667 629  430.11130 6.173291  
##         28) volatile.acidity>=0.465 11   10.72727 4.545455 *
##         29) volatile.acidity< 0.465 618  389.71680 6.202265 *
##       15) alcohol>=11.76667 665  403.99400 6.596992 *

library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 3.6.3

# Basic diagram of the fitted tree, with numbers shown to 3 digits.
rpart.plot(x = m.rpart, digits = 3)

# Second diagram with 4 digits and leaves aligned at the bottom
# (fallen.leaves); type and extra select the split-label layout and the
# additional per-node information -- see ?rpart.plot for the codes.
rpart.plot(
  x = m.rpart,
  type = 3,
  extra = 101,
  digits = 4,
  fallen.leaves = TRUE
)

# Step 4: Evaluating Model Performance
# Predict quality for the held-out wines and summarise the predictions.
p.rpart <- predict(object = m.rpart, newdata = wine_test)
summary(p.rpart)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.545   5.563   5.971   5.893   6.202   6.597

# Summary of the actual quality scores, for comparison with the predictions.
summary(wine_test[["quality"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.901   6.000   9.000

# Correlation between predicted and actual quality on the test set.
cor(x = p.rpart, y = wine_test[["quality"]])

## [1] 0.5369525

Q3: What is your interpretation of this RMSE value?

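Answer: The RMSE is 0.74 (see below), meaning the predicted quality typically differs from the actual quality by about 0.74 points, on a scale that runs from 3 to 9 in the test set.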

library(Metrics)

## Warning: package 'Metrics' was built under R version 3.6.3

# Root-mean-square error between actual and predicted quality on the test set.
Metrics::rmse(actual = wine_test$quality, predicted = p.rpart)

## [1] 0.7448093

# Method 4: News Popularity
# Step 1: Collecting the Data
# Load the online-news data set (first row holds the column names) and
# inspect its structure.
news <- read.csv(
  file = "/Users/FrankLin/Desktop/530/OnlineNewsPopularity_for_R.csv",
  header = TRUE
)
str(news)

## 'data.frame':    39644 obs. of  61 variables:
##  $ url                          : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ timedelta                    : num  731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : num  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : num  219 255 211 531 1072 ...
##  $ n_unique_tokens              : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num  1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num  0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : num  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : num  2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : num  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : num  5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: num  1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : num  0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : num  0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num  496 0 918 0 3151 ...
##  $ weekday_is_monday            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num  0.188 0 0 0 0.136 ...
##  $ shares                       : int  593 711 1500 1200 505 855 556 891 3600 710 ...

# Step 2: Pre-processing
library(caTools)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:randomForest':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Keep a reduced feature set (token/count statistics, selected keyword and
# sentiment measures, title sentiment columns) and recode `shares` as a
# two-level factor: "yes" for at least 1400 shares, "no" for everything else.
newsShort <- news %>%
  select(
    starts_with("n_"),
    starts_with("num_"),
    average_token_length,
    kw_max_max,
    global_sentiment_polarity,
    avg_positive_polarity,
    starts_with("title"),
    starts_with("abs"),
    shares,
    -n_non_stop_unique_tokens,
    -num_self_hrefs
  ) %>%
  mutate(
    shares = factor(
      case_when(
        as.numeric(shares) >= 1400 ~ "yes",
        TRUE ~ "no"
      )
    )
  )

# Split the data 90/10 into training and test partitions on the class label.
# The fixed seed makes the partition reproducible.
set.seed(12345)
news_split <- sample.split(newsShort$shares, SplitRatio = 0.9)
news_train <- newsShort[news_split, ]
news_test <- newsShort[!news_split, ]

# Class proportions of the label in the training partition.
prop.table(table(news_train[["shares"]]))

## 
##        no       yes 
## 0.4663957 0.5336043

# Class proportions of the label in the test partition.
prop.table(table(news_test[["shares"]]))

## 
##       no      yes 
## 0.466448 0.533552

# List the training columns to locate the position of the label `shares`.
names(news_train)

##  [1] "n_tokens_title"               "n_tokens_content"            
##  [3] "n_unique_tokens"              "n_non_stop_words"            
##  [5] "num_hrefs"                    "num_imgs"                    
##  [7] "num_videos"                   "num_keywords"                
##  [9] "average_token_length"         "kw_max_max"                  
## [11] "global_sentiment_polarity"    "avg_positive_polarity"       
## [13] "title_subjectivity"           "title_sentiment_polarity"    
## [15] "abs_title_subjectivity"       "abs_title_sentiment_polarity"
## [17] "shares"

# Fit a single C5.0 decision tree. Predictors are every column except
# column 17 (the label `shares`); the target is `shares` itself.
library(C50)
news_model <- C5.0(x = news_train[-17], y = news_train$shares)
summary(news_model)

## 
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Thu Jan 07 19:30:08 2021
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 35680 cases (17 attributes) from undefined.data
## 
## Decision tree:
## 
## num_imgs > 3:
## :...kw_max_max <= 15000:
## :   :...n_tokens_content <= 1206: no (25/4)
## :   :   n_tokens_content > 1206: yes (5)
## :   kw_max_max > 15000:
## :   :...num_keywords <= 5:
## :       :...num_videos > 2:
## :       :   :...abs_title_sentiment_polarity <= 0.1305556: no (49/11)
## :       :   :   abs_title_sentiment_polarity > 0.1305556:
## :       :   :   :...average_token_length <= 4.347826: no (5)
## :       :   :       average_token_length > 4.347826:
## :       :   :       :...num_hrefs > 12: yes (13/1)
## :       :   :           num_hrefs <= 12:
## :       :   :           :...title_subjectivity <= 0.5190476: no (7/1)
## :       :   :               title_subjectivity > 0.5190476: yes (2)
## :       :   num_videos <= 2:
## :       :   :...n_tokens_title <= 9:
## :       :       :...num_videos > 0: yes (78/20)
## :       :       :   num_videos <= 0:
## :       :       :   :...n_tokens_title > 7: yes (206/77)
## :       :       :       n_tokens_title <= 7:
## :       :       :       :...num_keywords <= 4: yes (28/11)
## :       :       :           num_keywords > 4: no (52/20)
## :       :       n_tokens_title > 9:
## :       :       :...num_imgs > 30:
## :       :           :...num_videos <= 0: no (74/25)
## :       :           :   num_videos > 0:
## :       :           :   :...global_sentiment_polarity <= 0.1016769: no (18)
## :       :           :       global_sentiment_polarity > 0.1016769: yes (16/5)
## :       :           num_imgs <= 30:
## :       :           :...kw_max_max <= 690400: yes (101/37)
## :       :               kw_max_max > 690400:
## :       :               :...num_keywords > 4: yes (356/168)
## :       :                   num_keywords <= 4:
## :       :                   :...num_videos <= 0: no (127/46)
## :       :                       num_videos > 0: yes (45/19)
## :       num_keywords > 5:
## :       :...n_tokens_content > 2951: yes (68/7)
## :           n_tokens_content <= 2951:
## :           :...num_hrefs > 11:
## :               :...n_tokens_title > 9:
## :               :   :...n_tokens_content <= 319: yes (634/180)
## :               :   :   n_tokens_content > 319:
## :               :   :   :...n_tokens_title <= 13: yes (2073/781)
## :               :   :       n_tokens_title > 13:
## :               :   :       :...num_keywords <= 9:
## :               :   :           :...abs_title_subjectivity <= 0.325: yes (63/27)
## :               :   :           :   abs_title_subjectivity > 0.325: no (70/23)
## :               :   :           num_keywords > 9:
## :               :   :           :...title_sentiment_polarity <= 0.452381: yes (49/16)
## :               :   :               title_sentiment_polarity > 0.452381: no (3)
## :               :   n_tokens_title <= 9:
## :               :   :...num_keywords > 6: yes (1640/461)
## :               :       num_keywords <= 6:
## :               :       :...n_tokens_content > 2216:
## :               :           :...global_sentiment_polarity <= 0.2137865: no (14/1)
## :               :           :   global_sentiment_polarity > 0.2137865: yes (3)
## :               :           n_tokens_content <= 2216:
## :               :           :...n_tokens_title <= 7:
## :               :               :...n_tokens_title <= 6:
## :               :               :   :...average_token_length <= 4.889474: yes (19/2)
## :               :               :   :   average_token_length > 4.889474: no (3)
## :               :               :   n_tokens_title > 6:
## :               :               :   :...avg_positive_polarity <= 0.3149621: no (3)
## :               :               :       avg_positive_polarity > 0.3149621: yes (43/9)
## :               :               n_tokens_title > 7:
## :               :               :...num_hrefs <= 27:
## :               :                   :...average_token_length <= 5.397143: yes (136/49)
## :               :                   :   average_token_length > 5.397143: no (7/1)
## :               :                   num_hrefs > 27:
## :               :                   :...avg_positive_polarity <= 0.4811688: yes (36/5)
## :               :                       avg_positive_polarity > 0.4811688: no (6/1)
## :               num_hrefs <= 11:
## :               :...num_videos > 1: yes (286/94)
## :                   num_videos <= 1:
## :                   :...n_tokens_content <= 373:
## :                       :...title_subjectivity <= 0.6822727:
## :                       :   :...num_imgs <= 41: yes (833/320)
## :                       :   :   num_imgs > 41:
## :                       :   :   :...title_subjectivity <= 0.2416667: yes (3)
## :                       :   :       title_subjectivity > 0.2416667: no (9)
## :                       :   title_subjectivity > 0.6822727:
## :                       :   :...kw_max_max > 690400: yes (155/35)
## :                       :       kw_max_max <= 690400:
## :                       :       :...title_subjectivity <= 0.875: yes (7)
## :                       :           title_subjectivity > 0.875: [S1]
## :                       n_tokens_content > 373:
## :                       :...average_token_length > 4.948864:
## :                           :...title_subjectivity > 0.44375: yes (59/26)
## :                           :   title_subjectivity <= 0.44375:
## :                           :   :...num_videos <= 0: no (101/30)
## :                           :       num_videos > 0:
## :                           :       :...avg_positive_polarity <= 0.4021769: no (33/8)
## :                           :           avg_positive_polarity > 0.4021769: yes (7/1)
## :                           average_token_length <= 4.948864:
## :                           :...kw_max_max <= 617900: yes (252/88)
## :                               kw_max_max > 617900:
## :                               :...n_tokens_title > 8:
## :                                   :...num_videos > 0:
## :                                   :   :...num_hrefs <= 8: yes (300/114)
## :                                   :   :   num_hrefs > 8: no (93/40)
## :                                   :   num_videos <= 0:
## :                                   :   :...num_hrefs > 6: yes (413/169)
## :                                   :       num_hrefs <= 6:
## :                                   :       :...n_tokens_title <= 9: yes (91/40)
## :                                   :           n_tokens_title > 9: no (404/182)
## :                                   n_tokens_title <= 8:
## :                                   :...kw_max_max > 690400: yes (182/57)
## :                                       kw_max_max <= 690400:
## :                                       :...n_tokens_content <= 473: no (10)
## :                                           n_tokens_content > 473:
## :                                           :...num_hrefs <= 4: yes (9)
## :                                               num_hrefs > 4: [S2]
## num_imgs <= 3:
## :...n_unique_tokens <= 0.4305239:
##     :...kw_max_max <= 663600: yes (248/55)
##     :   kw_max_max > 663600:
##     :   :...global_sentiment_polarity > 0.1322018:
##     :       :...num_videos > 8: no (33/15)
##     :       :   num_videos <= 8:
##     :       :   :...num_hrefs > 7: yes (569/131)
##     :       :       num_hrefs <= 7:
##     :       :       :...num_imgs <= 1: yes (122/44)
##     :       :           num_imgs > 1:
##     :       :           :...title_sentiment_polarity > 0.2071429: no (6)
##     :       :               title_sentiment_polarity <= 0.2071429:
##     :       :               :...n_tokens_content > 1220: no (15/3)
##     :       :                   n_tokens_content <= 1220:
##     :       :                   :...average_token_length <= 4.381201: no (7/2)
##     :       :                       average_token_length > 4.381201: yes (16/1)
##     :       global_sentiment_polarity <= 0.1322018:
##     :       :...num_hrefs > 32:
##     :           :...average_token_length <= 5.10728: yes (49/8)
##     :           :   average_token_length > 5.10728: no (3)
##     :           num_hrefs <= 32:
##     :           :...num_videos > 6: no (30/9)
##     :               num_videos <= 6:
##     :               :...average_token_length <= 4.709486: yes (1242/520)
##     :                   average_token_length > 4.709486:
##     :                   :...num_hrefs > 16:
##     :                       :...global_sentiment_polarity > 0.06642009: yes (54/13)
##     :                       :   global_sentiment_polarity <= 0.06642009:
##     :                       :   :...global_sentiment_polarity <= 0.01395874: yes (10/2)
##     :                       :       global_sentiment_polarity > 0.01395874: no (23/4)
##     :                       num_hrefs <= 16:
##     :                       :...n_unique_tokens <= 0.3680473:
##     :                           :...avg_positive_polarity <= 0.2991207: no (6/1)
##     :                           :   avg_positive_polarity > 0.2991207: yes (17/2)
##     :                           n_unique_tokens > 0.3680473:
##     :                           :...num_videos <= 1:
##     :                               :...n_unique_tokens <= 0.42898: no (153/33)
##     :                               :   n_unique_tokens > 0.42898: yes (8/2)
##     :                               num_videos > 1:
##     :                               :...num_keywords <= 5: no (5)
##     :                                   num_keywords > 5: yes (15/4)
##     n_unique_tokens > 0.4305239:
##     :...kw_max_max <= 617900:
##         :...kw_max_max > 69100: yes (1766/638)
##         :   kw_max_max <= 69100:
##         :   :...num_hrefs > 4: no (1103/546)
##         :       num_hrefs <= 4:
##         :       :...kw_max_max <= 37400:
##         :           :...title_subjectivity <= 0.725: no (86/32)
##         :           :   title_subjectivity > 0.725: yes (14/1)
##         :           kw_max_max > 37400:
##         :           :...num_imgs > 0: yes (313/135)
##         :               num_imgs <= 0:
##         :               :...num_keywords <= 5:
##         :                   :...num_keywords <= 4: yes (5/1)
##         :                   :   num_keywords > 4:
##         :                   :   :...average_token_length <= 4.465278: yes (8/2)
##         :                   :       average_token_length > 4.465278: no (14/2)
##         :                   num_keywords > 5:
##         :                   :...n_non_stop_words > 0.9999999: yes (213/69)
##         :                       n_non_stop_words <= 0.9999999:
##         :                       :...average_token_length <= 4.954774: yes (8)
##         :                           average_token_length > 4.954774: no (5)
##         kw_max_max > 617900:
##         :...num_hrefs > 21:
##             :...n_tokens_title > 9:
##             :   :...title_subjectivity > 0.975: yes (53/13)
##             :   :   title_subjectivity <= 0.975:
##             :   :   :...num_imgs <= 0: yes (137/53)
##             :   :       num_imgs > 0: no (559/274)
##             :   n_tokens_title <= 9:
##             :   :...kw_max_max <= 690400: yes (83/15)
##             :       kw_max_max > 690400:
##             :       :...global_sentiment_polarity > 0.06215278: yes (275/76)
##             :           global_sentiment_polarity <= 0.06215278:
##             :           :...num_imgs > 1: no (15/3)
##             :               num_imgs <= 1:
##             :               :...average_token_length > 5.667984: no (4)
##             :                   average_token_length <= 5.667984:
##             :                   :...num_hrefs > 37: yes (8)
##             :                       num_hrefs <= 37:
##             :                       :...n_tokens_title <= 7: yes (9/3)
##             :                           n_tokens_title > 7:
##             :                           :...title_subjectivity <= 0.775: no (29/8)
##             :                               title_subjectivity > 0.775: yes (2)
##             num_hrefs <= 21:
##             :...global_sentiment_polarity <= 0.07039773:
##                 :...num_imgs <= 0:
##                 :   :...num_videos > 25:
##                 :   :   :...n_unique_tokens <= 0.5114286: no (42/6)
##                 :   :   :   n_unique_tokens > 0.5114286:
##                 :   :   :   :...num_hrefs <= 4: no (4)
##                 :   :   :       num_hrefs > 4: yes (11/2)
##                 :   :   num_videos <= 25:
##                 :   :   :...num_keywords <= 5: no (236/96)
##                 :   :       num_keywords > 5:
##                 :   :       :...n_tokens_title <= 10: no (381/177)
##                 :   :           n_tokens_title > 10:
##                 :   :           :...kw_max_max > 690400: yes (414/192)
##                 :   :               kw_max_max <= 690400:
##                 :   :               :...num_keywords <= 6: yes (12)
##                 :   :                   num_keywords > 6:
##                 :   :                   :...n_unique_tokens <= 0.6736842:
##                 :   :                       :...num_videos <= 0: yes (6/2)
##                 :   :                       :   num_videos > 0: no (17/4)
##                 :   :                       n_unique_tokens > 0.6736842: [S3]
##                 :   num_imgs > 0:
##                 :   :...n_unique_tokens > 0.7686567:
##                 :       :...title_sentiment_polarity <= -0.35: no (3)
##                 :       :   title_sentiment_polarity > -0.35: yes (43/11)
##                 :       n_unique_tokens <= 0.7686567:
##                 :       :...num_hrefs <= 1: no (163/31)
##                 :           num_hrefs > 1:
##                 :           :...average_token_length <= 4.805281:
##                 :               :...num_imgs <= 2:
##                 :               :   :...average_token_length <= 4.305466: yes (170/74)
##                 :               :   :   average_token_length > 4.305466: no (2187/876)
##                 :               :   num_imgs > 2:
##                 :               :   :...num_hrefs <= 2: no (18/4)
##                 :               :       num_hrefs > 2: [S4]
##                 :               average_token_length > 4.805281:
##                 :               :...num_hrefs > 20: yes (26/9)
##                 :                   num_hrefs <= 20:
##                 :                   :...kw_max_max <= 690400:
##                 :                       :...n_tokens_title > 9:
##                 :                       :   :...num_hrefs <= 5: yes (27/8)
##                 :                       :   :   num_hrefs > 5:
##                 :                       :   :   :...num_videos > 0: [S5]
##                 :                       :   :       num_videos <= 0: [S6]
##                 :                       :   n_tokens_title <= 9: [S7]
##                 :                       kw_max_max > 690400:
##                 :                       :...num_keywords <= 6: no (798/218)
##                 :                           num_keywords > 6:
##                 :                           :...num_imgs <= 1:
##                 :                               :...num_videos <= 1: no (646/205)
##                 :                               :   num_videos > 1: [S8]
##                 :                               num_imgs > 1: [S9]
##                 global_sentiment_polarity > 0.07039773:
##                 :...num_imgs > 1:
##                     :...num_keywords > 9: yes (296/97)
##                     :   num_keywords <= 9:
##                     :   :...n_tokens_content <= 579: yes (1266/522)
##                     :       n_tokens_content > 579:
##                     :       :...title_subjectivity > 0.47: no (142/45)
##                     :           title_subjectivity <= 0.47:
##                     :           :...global_sentiment_polarity <= 0.09938672: no (117/41)
##                     :               global_sentiment_polarity > 0.09938672:
##                     :               :...num_hrefs > 3: yes (238/86)
##                     :                   num_hrefs <= 3: [S10]
##                     num_imgs <= 1:
##                     :...num_videos <= 0:
##                         :...title_sentiment_polarity <= -0.006397306:
##                         :   :...n_unique_tokens > 0.71875: no (164/25)
##                         :   :   n_unique_tokens <= 0.71875:
##                         :   :   :...n_tokens_title > 12: no (133/38)
##                         :   :       n_tokens_title <= 12:
##                         :   :       :...num_imgs > 0:
##                         :   :           :...n_tokens_title <= 7: yes (29/11)
##                         :   :           :   n_tokens_title > 7: no (404/161)
##                         :   :           num_imgs <= 0:
##                         :   :           :...n_tokens_title > 10: yes (28/7)
##                         :   :               n_tokens_title <= 10:
##                         :   :               :...num_keywords <= 6: no (13/1)
##                         :   :                   num_keywords > 6: [S11]
##                         :   title_sentiment_polarity > -0.006397306:
##                         :   :...num_imgs <= 0:
##                         :       :...abs_title_subjectivity > 0.2188552: yes (464/190)
##                         :       :   abs_title_subjectivity <= 0.2188552:
##                         :       :   :...num_hrefs <= 12:
##                         :       :       :...average_token_length <= 4.844927: no (211/84)
##                         :       :       :   average_token_length > 4.844927:
##                         :       :       :   :...num_hrefs <= 7: yes (59/16)
##                         :       :       :       num_hrefs > 7: no (6/1)
##                         :       :       num_hrefs > 12: [S12]
##                         :       num_imgs > 0:
##                         :       :...n_tokens_content <= 684:
##                         :           :...n_tokens_content <= 121: yes (106/36)
##                         :           :   n_tokens_content > 121:
##                         :           :   :...kw_max_max > 690400:
##                         :           :       :...num_hrefs > 3: no (3098/1378)
##                         :           :       :   num_hrefs <= 3:
##                         :           :       :   :...num_keywords <= 9: no (519/181)
##                         :           :       :       num_keywords > 9: [S13]
##                         :           :       kw_max_max <= 690400:
##                         :           :       :...n_unique_tokens > 0.5162601: no (730/318)
##                         :           :           n_unique_tokens <= 0.5162601: [S14]
##                         :           n_tokens_content > 684:
##                         :           :...global_sentiment_polarity <= 0.1618714:
##                         :               :...kw_max_max <= 690400:
##                         :               :   :...n_unique_tokens > 0.515748: yes (9)
##                         :               :   :   n_unique_tokens <= 0.515748: [S15]
##                         :               :   kw_max_max > 690400:
##                         :               :   :...num_hrefs > 9: [S16]
##                         :               :       num_hrefs <= 9: [S17]
##                         :               global_sentiment_polarity > 0.1618714:
##                         :               :...kw_max_max <= 690400: [S18]
##                         :                   kw_max_max > 690400:
##                         :                   :...abs_title_subjectivity <= 0.19375:
##                         :                       :...num_keywords > 8: [S19]
##                         :                       :   num_keywords <= 8: [S20]
##                         :                       abs_title_subjectivity > 0.19375: [S21]
##                         num_videos > 0:
##                         :...num_videos > 3:
##                             :...kw_max_max <= 690400:
##                             :   :...num_hrefs <= 4: no (25/11)
##                             :   :   num_hrefs > 4: [S22]
##                             :   kw_max_max > 690400:
##                             :   :...num_imgs > 0:
##                             :       :...num_videos <= 11: yes (195/97)
##                             :       :   num_videos > 11:
##                             :       :   :...num_videos <= 19: yes (63/15)
##                             :       :       num_videos > 19: [S23]
##                             :       num_imgs <= 0:
##                             :       :...num_keywords <= 8: yes (220/72)
##                             :           num_keywords > 8:
##                             :           :...global_sentiment_polarity > 0.1930876:
##                             :               :...num_keywords > 9: yes (20/2)
##                             :               :   num_keywords <= 9: [S24]
##                             :               global_sentiment_polarity <= 0.1930876: [S25]
##                             num_videos <= 3:
##                             :...kw_max_max <= 690400:
##                                 :...num_imgs <= 0: yes (308/115)
##                                 :   num_imgs > 0:
##                                 :   :...average_token_length <= 4.377551: yes (19/1)
##                                 :       average_token_length > 4.377551:
##                                 :       :...num_hrefs > 9: yes (42/12)
##                                 :           num_hrefs <= 9:
##                                 :           :...num_keywords <= 6: yes (45/21)
##                                 :               num_keywords > 6: [S26]
##                                 kw_max_max > 690400:
##                                 :...num_imgs > 0:
##                                     :...num_hrefs <= 2: no (137/55)
##                                     :   num_hrefs > 2:
##                                     :   :...num_keywords > 6: yes (947/426)
##                                     :       num_keywords <= 6:
##                                     :       :...num_hrefs > 9: [S27]
##                                     :           num_hrefs <= 9:
##                                     :           :...num_videos > 1: no (75/36)
##                                     :               num_videos <= 1: [S28]
##                                     num_imgs <= 0:
##                                     :...n_tokens_title <= 9: no (343/150)
##                                         n_tokens_title > 9:
##                                         :...num_hrefs <= 4: yes (427/179)
##                                             num_hrefs > 4: [S29]
## 
## SubTree [S1]
## 
## title_sentiment_polarity <= -0.4196429: yes (4)
## title_sentiment_polarity > -0.4196429: no (19/6)
## 
## SubTree [S2]
## 
## title_subjectivity > 0.4469697: yes (3)
## title_subjectivity <= 0.4469697:
## :...num_keywords <= 7: no (12/2)
##     num_keywords > 7:
##     :...avg_positive_polarity <= 0.3463925: no (2)
##         avg_positive_polarity > 0.3463925: yes (4)
## 
## SubTree [S3]
## 
## title_sentiment_polarity <= -0.2666667: no (4/1)
## title_sentiment_polarity > -0.2666667: yes (14)
## 
## SubTree [S4]
## 
## title_sentiment_polarity > -0.06875: yes (105/35)
## title_sentiment_polarity <= -0.06875:
## :...num_hrefs <= 15: no (21/4)
##     num_hrefs > 15: yes (3)
## 
## SubTree [S5]
## 
## abs_title_sentiment_polarity <= 0.1277778: no (4)
## abs_title_sentiment_polarity > 0.1277778: yes (2)
## 
## SubTree [S6]
## 
## n_tokens_title <= 11: no (44/15)
## n_tokens_title > 11:
## :...abs_title_subjectivity <= 0.19: no (3)
##     abs_title_subjectivity > 0.19: yes (11/2)
## 
## SubTree [S7]
## 
## average_token_length > 5.098591: no (35/5)
## average_token_length <= 5.098591:
## :...title_sentiment_polarity <= -0.1185185: yes (7/1)
##     title_sentiment_polarity > -0.1185185:
##     :...num_hrefs <= 8: no (31/7)
##         num_hrefs > 8:
##         :...avg_positive_polarity <= 0.2806667: yes (8)
##             avg_positive_polarity > 0.2806667:
##             :...average_token_length <= 4.971723: no (9/1)
##                 average_token_length > 4.971723: yes (8/2)
## 
## SubTree [S8]
## 
## avg_positive_polarity <= 0.393142: no (33/10)
## avg_positive_polarity > 0.393142: yes (8/1)
## 
## SubTree [S9]
## 
## n_tokens_content > 280: no (192/64)
## n_tokens_content <= 280:
## :...num_imgs > 2: no (4/1)
##     num_imgs <= 2:
##     :...num_keywords > 8: yes (11)
##         num_keywords <= 8:
##         :...num_videos > 0: yes (4)
##             num_videos <= 0:
##             :...global_sentiment_polarity > 0.035639: no (6)
##                 global_sentiment_polarity <= 0.035639:
##                 :...average_token_length <= 4.830716: no (2)
##                     average_token_length > 4.830716: yes (7)
## 
## SubTree [S10]
## 
## abs_title_sentiment_polarity <= 0.05833333: no (26/5)
## abs_title_sentiment_polarity > 0.05833333:
## :...num_imgs > 2: yes (4)
##     num_imgs <= 2:
##     :...n_tokens_content <= 735: yes (4)
##         n_tokens_content > 735: no (5)
## 
## SubTree [S11]
## 
## n_tokens_title <= 8: yes (8)
## n_tokens_title > 8:
## :...num_hrefs <= 5: no (6)
##     num_hrefs > 5: yes (5/1)
## 
## SubTree [S12]
## 
## title_sentiment_polarity > 0.239899: no (28/2)
## title_sentiment_polarity <= 0.239899:
## :...num_hrefs <= 17: no (2)
##     num_hrefs > 17: yes (4)
## 
## SubTree [S13]
## 
## n_unique_tokens <= 0.6868132: yes (33/10)
## n_unique_tokens > 0.6868132: no (4)
## 
## SubTree [S14]
## 
## global_sentiment_polarity > 0.1258744: yes (101/27)
## global_sentiment_polarity <= 0.1258744:
## :...num_hrefs <= 8:
##     :...num_keywords <= 4: yes (2)
##     :   num_keywords > 4: no (26/5)
##     num_hrefs > 8:
##     :...n_unique_tokens <= 0.486924: yes (7)
##         n_unique_tokens > 0.486924:
##         :...n_unique_tokens <= 0.4950298: no (4)
##             n_unique_tokens > 0.4950298: yes (9/3)
## 
## SubTree [S15]
## 
## n_unique_tokens <= 0.5036573: yes (86/38)
## n_unique_tokens > 0.5036573: no (8)
## 
## SubTree [S16]
## 
## average_token_length <= 4.67347: yes (110/50)
## average_token_length > 4.67347: no (183/60)
## 
## SubTree [S17]
## 
## num_keywords > 8: yes (42/12)
## num_keywords <= 8:
## :...num_hrefs <= 7: no (85/36)
##     num_hrefs > 7:
##     :...avg_positive_polarity <= 0.2941558: no (4)
##         avg_positive_polarity > 0.2941558: yes (44/13)
## 
## SubTree [S18]
## 
## avg_positive_polarity > 0.421887: yes (14/1)
## avg_positive_polarity <= 0.421887:
## :...n_tokens_title <= 8: yes (16/5)
##     n_tokens_title > 8: no (16/5)
## 
## SubTree [S19]
## 
## num_hrefs <= 8: no (2)
## num_hrefs > 8: yes (14/1)
## 
## SubTree [S20]
## 
## average_token_length > 4.915493: no (15/1)
## average_token_length <= 4.915493:
## :...average_token_length <= 4.544326: no (7/1)
##     average_token_length > 4.544326: yes (20/4)
## 
## SubTree [S21]
## 
## avg_positive_polarity <= 0.4100406: yes (108/21)
## avg_positive_polarity > 0.4100406:
## :...avg_positive_polarity <= 0.418595: no (10/1)
##     avg_positive_polarity > 0.418595:
##     :...global_sentiment_polarity > 0.2345238: yes (13)
##         global_sentiment_polarity <= 0.2345238:
##         :...num_keywords <= 8: yes (23/5)
##             num_keywords > 8:
##             :...n_tokens_content <= 1020: no (13/2)
##                 n_tokens_content > 1020: yes (2)
## 
## SubTree [S22]
## 
## global_sentiment_polarity > 0.09930704: yes (70/10)
## global_sentiment_polarity <= 0.09930704:
## :...average_token_length <= 4.597379: no (7)
##     average_token_length > 4.597379: yes (7/1)
## 
## SubTree [S23]
## 
## avg_positive_polarity <= 0.5407143: no (9/1)
## avg_positive_polarity > 0.5407143: yes (2)
## 
## SubTree [S24]
## 
## average_token_length <= 4.452631: no (5)
## average_token_length > 4.452631: yes (12/2)
## 
## SubTree [S25]
## 
## title_subjectivity > 0.71875: no (9)
## title_subjectivity <= 0.71875:
## :...title_sentiment_polarity <= 0.1840909: no (28/9)
##     title_sentiment_polarity > 0.1840909:
##     :...average_token_length <= 4.408935: no (2)
##         average_token_length > 4.408935: yes (10)
## 
## SubTree [S26]
## 
## title_sentiment_polarity <= 0.4375: no (74/18)
## title_sentiment_polarity > 0.4375: yes (4)
## 
## SubTree [S27]
## 
## abs_title_sentiment_polarity <= 0.75: no (127/44)
## abs_title_sentiment_polarity > 0.75: yes (8/1)
## 
## SubTree [S28]
## 
## num_keywords <= 4: no (56/24)
## num_keywords > 4: yes (273/126)
## 
## SubTree [S29]
## 
## average_token_length > 4.990991: yes (28/7)
## average_token_length <= 4.990991:
## :...num_videos > 1:
##     :...avg_positive_polarity <= 0.2595328: yes (10/1)
##     :   avg_positive_polarity > 0.2595328: no (132/59)
##     num_videos <= 1:
##     :...num_hrefs > 8: no (88/29)
##         num_hrefs <= 8:
##         :...average_token_length <= 4.405406: no (66/21)
##             average_token_length > 4.405406:
##             :...average_token_length <= 4.456057: yes (10)
##                 average_token_length > 4.456057:
##                 :...num_hrefs > 7:
##                     :...abs_title_sentiment_polarity <= 0.04166667: yes (12)
##                     :   abs_title_sentiment_polarity > 0.04166667:
##                     :   :...n_tokens_title <= 11: no (16/5)
##                     :       n_tokens_title > 11: yes (7/1)
##                     num_hrefs <= 7:
##                     :...average_token_length <= 4.646048: no (56/19)
##                         average_token_length > 4.646048:
##                         :...title_subjectivity > 0.5166667: yes (9)
##                             title_subjectivity <= 0.5166667:
##                             :...avg_positive_polarity > 0.4453349: no (9/1)
##                                 avg_positive_polarity <= 0.4453349:
##                                 :...num_keywords <= 9: yes (29/7)
##                                     num_keywords > 9:
##                                     :...num_hrefs <= 5: yes (2)
##                                         num_hrefs > 5: no (4)
## 
## 
## Evaluation on training data (35680 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##     254 13129(36.8%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    9443  7198    (a): class no
##    5931 13108    (b): class yes
## 
## 
##  Attribute usage:
## 
##  100.00% num_imgs
##  100.00% kw_max_max
##   90.75% num_hrefs
##   73.74% n_unique_tokens
##   60.95% global_sentiment_polarity
##   55.50% num_videos
##   48.60% num_keywords
##   44.11% n_tokens_content
##   32.35% n_tokens_title
##   26.93% average_token_length
##   21.13% title_sentiment_polarity
##    7.79% title_subjectivity
##    3.22% abs_title_subjectivity
##    1.91% avg_positive_polarity
##    0.82% abs_title_sentiment_polarity
##    0.63% n_non_stop_words
## 
## 
## Time: 1.0 secs

## Step 3: Evaluation
# Predict the class of each held-out article with the C5.0 tree, then
# cross-tabulate predictions (rows) against the true labels (columns).
news_pred <- predict(news_model, newdata = news_test)
p <- table(news_pred, news_test$shares)
p

##          
## news_pred   no  yes
##       no   954  714
##       yes  895 1401

# Accuracy in percent: diagonal of the confusion table over all test cases.
Accuracy <- sum(diag(p)) / sum(p) * 100
Accuracy

## [1] 59.40969

# Random Forest
# Fit a random forest on the same training data, with no tuning arguments,
# using every remaining column as a predictor of `shares`.
library(randomForest)
random_model <- randomForest(shares ~ ., data = news_train)
# NOTE(review): summary() on this object only lists its components;
# print(random_model) may be the more informative call -- confirm.
summary(random_model)

##                 Length Class  Mode     
## call                3  -none- call     
## type                1  -none- character
## predicted       35680  factor numeric  
## err.rate         1500  -none- numeric  
## confusion           6  -none- numeric  
## votes           71360  matrix numeric  
## oob.times       35680  -none- numeric  
## classes             2  -none- character
## importance         16  -none- numeric  
## importanceSD        0  -none- NULL     
## localImportance     0  -none- NULL     
## proximity           0  -none- NULL     
## ntree               1  -none- numeric  
## mtry                1  -none- numeric  
## forest             14  -none- list     
## y               35680  factor numeric  
## test                0  -none- NULL     
## inbag               0  -none- NULL     
## terms               3  terms  call

# Score the random forest on the test partition and report accuracy in percent.
# NOTE(review): `cred_pred` holds the news predictions; the name looks carried
# over from another exercise. `p` is overwritten with the new confusion table.
cred_pred <- predict(random_model, newdata = news_test)
p <- table(cred_pred, news_test$shares)

sum(diag(p)) / sum(p) * 100

## [1] 61.12513

Lab 1

Zihao Lin

1/6/2021

Q1- If you see an accuracy of 100%, what does it mean? Does this mean that we designed a perfect model? This is something that needs more discussion. Write a few sentences about an accuracy of 100%.

Answer: I think something is wrong with my model if the accuracy is 100%. No, it is not a perfect model. Probably the test set is very similar to the training set.

Q3: What is your interpretation of this RMSE value?

Answer: The RMSE is 0.74; see below.