Method #1.Tree-based classification

Step 1: Collecting the data

library(readr)
credit <- read_csv("C:/Users/Gautam/OneDrive/HU/3rdsem/ANLY530/Assignment530/credit.csv")
## Parsed with column specification:
## cols(
##   .default = col_double()
## )
## See spec(...) for full column specifications.
View(credit)


wine <- read_csv("C:/Users/Gautam/OneDrive/HU/3rdsem/ANLY530/Assignment530/whitewines.csv")
## Parsed with column specification:
## cols(
##   `fixed acidity` = col_double(),
##   `volatile acidity` = col_double(),
##   `citric acid` = col_double(),
##   `residual sugar` = col_double(),
##   chlorides = col_double(),
##   `free sulfur dioxide` = col_double(),
##   `total sulfur dioxide` = col_double(),
##   density = col_double(),
##   pH = col_double(),
##   sulphates = col_double(),
##   alcohol = col_double(),
##   quality = col_double()
## )
View(wine)

Step 2: Exploring the data

str(credit)
## tibble [1,000 x 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Creditability                    : num [1:1000] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Account Balance                  : num [1:1000] 1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration of Credit (month)       : num [1:1000] 18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment Status of Previous Credit: num [1:1000] 4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : num [1:1000] 2 0 9 0 0 0 0 0 3 3 ...
##  $ Credit Amount                    : num [1:1000] 1049 2799 841 2122 2171 ...
##  $ Value Savings/Stocks             : num [1:1000] 1 1 2 1 1 1 1 1 1 3 ...
##  $ Length of current employment     : num [1:1000] 2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment per cent              : num [1:1000] 4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex & Marital Status             : num [1:1000] 2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : num [1:1000] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration in Current address      : num [1:1000] 4 2 4 2 4 3 4 4 4 4 ...
##  $ Most valuable available asset    : num [1:1000] 2 1 1 1 2 1 1 1 3 4 ...
##  $ Age (years)                      : num [1:1000] 21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent Credits               : num [1:1000] 3 3 3 3 1 3 3 3 3 3 ...
##  $ Type of apartment                : num [1:1000] 1 1 1 1 2 1 2 2 2 1 ...
##  $ No of Credits at this Bank       : num [1:1000] 1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : num [1:1000] 3 3 2 2 2 2 2 2 1 1 ...
##  $ No of dependents                 : num [1:1000] 1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : num [1:1000] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign Worker                   : num [1:1000] 1 1 1 2 2 2 2 2 1 1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Creditability = col_double(),
##   ..   `Account Balance` = col_double(),
##   ..   `Duration of Credit (month)` = col_double(),
##   ..   `Payment Status of Previous Credit` = col_double(),
##   ..   Purpose = col_double(),
##   ..   `Credit Amount` = col_double(),
##   ..   `Value Savings/Stocks` = col_double(),
##   ..   `Length of current employment` = col_double(),
##   ..   `Instalment per cent` = col_double(),
##   ..   `Sex & Marital Status` = col_double(),
##   ..   Guarantors = col_double(),
##   ..   `Duration in Current address` = col_double(),
##   ..   `Most valuable available asset` = col_double(),
##   ..   `Age (years)` = col_double(),
##   ..   `Concurrent Credits` = col_double(),
##   ..   `Type of apartment` = col_double(),
##   ..   `No of Credits at this Bank` = col_double(),
##   ..   Occupation = col_double(),
##   ..   `No of dependents` = col_double(),
##   ..   Telephone = col_double(),
##   ..   `Foreign Worker` = col_double()
##   .. )
str(wine)
## tibble [4,898 x 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ fixed acidity       : num [1:4898] 6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
##  $ volatile acidity    : num [1:4898] 0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
##  $ citric acid         : num [1:4898] 0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
##  $ residual sugar      : num [1:4898] 1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
##  $ chlorides           : num [1:4898] 0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
##  $ free sulfur dioxide : num [1:4898] 6 41 33 11 36 22 33 17 34 40 ...
##  $ total sulfur dioxide: num [1:4898] 62 113 123 74 119 95 152 67 90 130 ...
##  $ density             : num [1:4898] 0.993 0.999 0.995 0.991 0.993 ...
##  $ pH                  : num [1:4898] 3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
##  $ sulphates           : num [1:4898] 0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
##  $ alcohol             : num [1:4898] 10.4 8.9 10.1 11.2 10.9 ...
##  $ quality             : num [1:4898] 5 6 6 4 6 6 6 6 6 7 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `fixed acidity` = col_double(),
##   ..   `volatile acidity` = col_double(),
##   ..   `citric acid` = col_double(),
##   ..   `residual sugar` = col_double(),
##   ..   chlorides = col_double(),
##   ..   `free sulfur dioxide` = col_double(),
##   ..   `total sulfur dioxide` = col_double(),
##   ..   density = col_double(),
##   ..   pH = col_double(),
##   ..   sulphates = col_double(),
##   ..   alcohol = col_double(),
##   ..   quality = col_double()
##   .. )
names(credit) <- make.names(names(credit))
names(wine) <- make.names(names(wine))

summary(credit$Credit.Amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
credit$Creditability <- as.factor(credit$Creditability)
table(credit$Creditability)
## 
##   0   1 
## 300 700

Step 3: Training a model on the data

set.seed(12345) 
credit_rand <- credit[order(runif(1000)), ]
summary(credit$Credit.Amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
credit_train <- credit_rand[1:900, ]
credit_test <- credit_rand[901:1000, ]
prop.table(table(credit_train$Creditability))
## 
##         0         1 
## 0.3088889 0.6911111
prop.table(table(credit_test$Creditability))
## 
##    0    1 
## 0.22 0.78
library(C50)
## Warning: package 'C50' was built under R version 4.0.3
credit_model <- C5.0(x = credit_train[-1], y = credit_train$Creditability)
credit_model
## 
## Call:
## C5.0.default(x = credit_train[-1], y = credit_train$Creditability)
## 
## Classification Tree
## Number of samples: 900 
## Number of predictors: 20 
## 
## Tree size: 85 
## 
## Non-standard options: attempt to group attributes

Step 4: Evaluating Model Performance

library(gmodels)
## Warning: package 'gmodels' was built under R version 4.0.3
cred_pred <- predict(credit_model, credit_test)
CrossTable(credit_test$Creditability, cred_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability', 'Predicted Creditability'))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                      | Predicted Creditability 
## Actual Creditability |         0 |         1 | Row Total | 
## ---------------------|-----------|-----------|-----------|
##                    0 |         8 |        14 |        22 | 
##                      |     0.080 |     0.140 |           | 
## ---------------------|-----------|-----------|-----------|
##                    1 |        17 |        61 |        78 | 
##                      |     0.170 |     0.610 |           | 
## ---------------------|-----------|-----------|-----------|
##         Column Total |        25 |        75 |       100 | 
## ---------------------|-----------|-----------|-----------|
## 
## 

Q1- If you see an accuracy of 100%, what does it mean? Does this mean that we design a perfect model? This is some thing that needs more discussion. Write a few sentences about accuracy of 100%.

100% Accuracy tells us that the test data is the exact match of the training data.The model is not perfect as it is only applicable to a particular kind of data(training).It will fail to perform in different conditions of the test data.

Method #2. Random forest

Step 1 : Model creation

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
random_model <- randomForest(Creditability ~ . , data= credit_train)
summary(random_model)
##                 Length Class  Mode     
## call               3   -none- call     
## type               1   -none- character
## predicted        900   factor numeric  
## err.rate        1500   -none- numeric  
## confusion          6   -none- numeric  
## votes           1800   matrix numeric  
## oob.times        900   -none- numeric  
## classes            2   -none- character
## importance        20   -none- numeric  
## importanceSD       0   -none- NULL     
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            14   -none- list     
## y                900   factor numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call

Step 2 : Evaluating Model Performance

cred_pred <- predict(random_model, credit_test) 
p <- table(cred_pred, credit_test$Creditability)
Accuracy <- sum(diag(p))/sum(p)*100
Accuracy
## [1] 79

Q2- What are the three most important features in this model.

Credit Amount -50.86 , Account.Balance-40.11,Age.year -38.31

importance(random_model)
##                                   MeanDecreaseGini
## Account.Balance                          42.599355
## Duration.of.Credit..month.               37.502785
## Payment.Status.of.Previous.Credit        22.563009
## Purpose                                  23.774048
## Credit.Amount                            52.397155
## Value.Savings.Stocks                     19.388385
## Length.of.current.employment             20.221289
## Instalment.per.cent                      16.394636
## Sex...Marital.Status                     13.424449
## Guarantors                                7.475422
## Duration.in.Current.address              15.563685
## Most.valuable.available.asset            17.326842
## Age..years.                              37.377916
## Concurrent.Credits                        8.480725
## Type.of.apartment                         9.595344
## No.of.Credits.at.this.Bank                8.424006
## Occupation                               12.669816
## No.of.dependents                          5.774473
## Telephone                                 7.505291
## Foreign.Worker                            1.746964

Now, Change the random seed to 23458 and find the new accuracy of random forest. Accuracy is changed to 75%

set.seed(23458) 
credit_rand <- credit[order(runif(1000)), ]
summary(credit$Credit.Amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
credit_train <- credit_rand[1:900, ]
credit_test <- credit_rand[901:1000, ]
random_model <- randomForest(Creditability ~ . , data= credit_train)
cred_pred <- predict(random_model, credit_test) 
p <- table(cred_pred, credit_test$Creditability)
Accuracy <- sum(diag(p))/sum(p)*100
Accuracy
## [1] 75

When I changed the seed value to 23458 ,the accuracy got decreased and it is 75% now.

Method #3. Adding regression to trees

Step 2: Exploring and Preparing the Data

hist(wine$quality)

wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]

Step 3: Training a Model on the Data

library(rpart)
## Warning: package 'rpart' was built under R version 4.0.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.3
m.rpart <- rpart(quality ~ ., data=wine_train)
m.rpart
## n= 3750 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 3750 2945.53200 5.870933  
##    2) alcohol< 10.85 2372 1418.86100 5.604975  
##      4) volatile.acidity>=0.2275 1611  821.30730 5.432030  
##        8) volatile.acidity>=0.3025 688  278.97670 5.255814 *
##        9) volatile.acidity< 0.3025 923  505.04230 5.563380 *
##      5) volatile.acidity< 0.2275 761  447.36400 5.971091 *
##    3) alcohol>=10.85 1378 1070.08200 6.328737  
##      6) free.sulfur.dioxide< 10.5 84   95.55952 5.369048 *
##      7) free.sulfur.dioxide>=10.5 1294  892.13600 6.391036  
##       14) alcohol< 11.76667 629  430.11130 6.173291  
##         28) volatile.acidity>=0.465 11   10.72727 4.545455 *
##         29) volatile.acidity< 0.465 618  389.71680 6.202265 *
##       15) alcohol>=11.76667 665  403.99400 6.596992 *
rpart.plot(m.rpart, digits=3)

rpart.plot(m.rpart, digits=4, fallen.leaves = TRUE, type = 3, extra = 101)

Step 4: Evaluating Model Performance

p.rpart=predict(m.rpart, wine_test)
summary(p.rpart)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.545   5.563   5.971   5.893   6.202   6.597
summary(wine_test$quality)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.901   6.000   9.000
cor(p.rpart, wine_test$quality)
## [1] 0.5369525
#Calculate RMSE 
#Calculate RMSE 
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.0.3
rmse(wine_test$quality,p.rpart)
## [1] 0.7448093
#Another method
sqrt(mean((wine_test$quality - p.rpart)^2))
## [1] 0.7448093

What is your interpretation about this amount of RMSE?

Smaller the value of RMSE , the model is better.The value of RMSE is high which tells us that the model is not a good fit.

Method #4. News Popularity

Step 1: Collecting the Data

online <- read_csv("C:/Users/Gautam/OneDrive/HU/3rdsem/ANLY530/Assignment530/OnlineNewsPopularity_for_R.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   url = col_character()
## )
## See spec(...) for full column specifications.
str(online)
## tibble [39,644 x 61] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ url                          : chr [1:39644] "http://mashable.com/2013/01/07/amazon-instant-video-browser/" "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/" "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/" "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/" ...
##  $ timedelta                    : num [1:39644] 731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : num [1:39644] 12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : num [1:39644] 219 255 211 531 1072 ...
##  $ n_unique_tokens              : num [1:39644] 0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num [1:39644] 1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num [1:39644] 0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : num [1:39644] 4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : num [1:39644] 2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : num [1:39644] 1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : num [1:39644] 0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num [1:39644] 4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : num [1:39644] 5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : num [1:39644] 0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: num [1:39644] 1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : num [1:39644] 0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : num [1:39644] 0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : num [1:39644] 0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num [1:39644] 496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num [1:39644] 496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num [1:39644] 496 0 918 0 3151 ...
##  $ weekday_is_monday            : num [1:39644] 1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num [1:39644] 0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num [1:39644] 0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num [1:39644] 0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num [1:39644] 0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num [1:39644] 0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num [1:39644] 0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num [1:39644] 0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num [1:39644] 0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num [1:39644] 0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num [1:39644] 0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num [1:39644] 0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num [1:39644] 0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num [1:39644] 0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num [1:39644] 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num [1:39644] -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num [1:39644] -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num [1:39644] -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num [1:39644] 0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num [1:39644] -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num [1:39644] 0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num [1:39644] 0.188 0 0 0 0.136 ...
##  $ shares                       : num [1:39644] 593 711 1500 1200 505 855 556 891 3600 710 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   url = col_character(),
##   ..   timedelta = col_double(),
##   ..   n_tokens_title = col_double(),
##   ..   n_tokens_content = col_double(),
##   ..   n_unique_tokens = col_double(),
##   ..   n_non_stop_words = col_double(),
##   ..   n_non_stop_unique_tokens = col_double(),
##   ..   num_hrefs = col_double(),
##   ..   num_self_hrefs = col_double(),
##   ..   num_imgs = col_double(),
##   ..   num_videos = col_double(),
##   ..   average_token_length = col_double(),
##   ..   num_keywords = col_double(),
##   ..   data_channel_is_lifestyle = col_double(),
##   ..   data_channel_is_entertainment = col_double(),
##   ..   data_channel_is_bus = col_double(),
##   ..   data_channel_is_socmed = col_double(),
##   ..   data_channel_is_tech = col_double(),
##   ..   data_channel_is_world = col_double(),
##   ..   kw_min_min = col_double(),
##   ..   kw_max_min = col_double(),
##   ..   kw_avg_min = col_double(),
##   ..   kw_min_max = col_double(),
##   ..   kw_max_max = col_double(),
##   ..   kw_avg_max = col_double(),
##   ..   kw_min_avg = col_double(),
##   ..   kw_max_avg = col_double(),
##   ..   kw_avg_avg = col_double(),
##   ..   self_reference_min_shares = col_double(),
##   ..   self_reference_max_shares = col_double(),
##   ..   self_reference_avg_sharess = col_double(),
##   ..   weekday_is_monday = col_double(),
##   ..   weekday_is_tuesday = col_double(),
##   ..   weekday_is_wednesday = col_double(),
##   ..   weekday_is_thursday = col_double(),
##   ..   weekday_is_friday = col_double(),
##   ..   weekday_is_saturday = col_double(),
##   ..   weekday_is_sunday = col_double(),
##   ..   is_weekend = col_double(),
##   ..   LDA_00 = col_double(),
##   ..   LDA_01 = col_double(),
##   ..   LDA_02 = col_double(),
##   ..   LDA_03 = col_double(),
##   ..   LDA_04 = col_double(),
##   ..   global_subjectivity = col_double(),
##   ..   global_sentiment_polarity = col_double(),
##   ..   global_rate_positive_words = col_double(),
##   ..   global_rate_negative_words = col_double(),
##   ..   rate_positive_words = col_double(),
##   ..   rate_negative_words = col_double(),
##   ..   avg_positive_polarity = col_double(),
##   ..   min_positive_polarity = col_double(),
##   ..   max_positive_polarity = col_double(),
##   ..   avg_negative_polarity = col_double(),
##   ..   min_negative_polarity = col_double(),
##   ..   max_negative_polarity = col_double(),
##   ..   title_subjectivity = col_double(),
##   ..   title_sentiment_polarity = col_double(),
##   ..   abs_title_subjectivity = col_double(),
##   ..   abs_title_sentiment_polarity = col_double(),
##   ..   shares = col_double()
##   .. )
#I have selected the columns which might be helpful in determining the newS popularity
newonline <- online[,c(3,4,5,7,10,11,12,13,14:19,24,26,30,31,39,45:49,54,58,57,61)]
dim(newonline)
## [1] 39644    28
summary(newonline$n_tokens_title)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0     9.0    10.0    10.4    12.0    23.0

Step 2: Pre-processing

newonline$shares <- factor(ifelse(newonline$shares>=1400, "favorite", "notfavorite"))
head(newonline$shares)
## [1] notfavorite notfavorite favorite    notfavorite notfavorite notfavorite
## Levels: favorite notfavorite
set.seed(12345) 
online_rand <-  newonline[order(runif(24000)), ]
summary(online_rand$n_tokens_title)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    9.00   10.00   10.04   11.00   19.00
online_train <- online_rand[1:18000, ]
online_test <- online_rand[18001:24000, ]

prop.table(table(online_train$shares))
## 
##    favorite notfavorite 
##   0.5461667   0.4538333
prop.table(table(online_test$shares))
## 
##    favorite notfavorite 
##       0.551       0.449

Step 3a: Training a model on the data (Decision tress)

online_model <- C5.0(x = online_train[-28], y = online_train$shares)
online_model
## 
## Call:
## C5.0.default(x = online_train[-28], y = online_train$shares)
## 
## Classification Tree
## Number of samples: 18000 
## Number of predictors: 27 
## 
## Tree size: 157 
## 
## Non-standard options: attempt to group attributes

Step 4:Evaluation model performance(Decision Tree)

online_pred <- predict(online_model, online_test)
(p <- table(online_pred,online_test$shares))
##              
## online_pred   favorite notfavorite
##   favorite        2337        1229
##   notfavorite      969        1465
CrossTable(online_test$shares, online_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Shares', 'Predicted shares'))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  6000 
## 
##  
##               | Predicted shares 
## Actual Shares |    favorite | notfavorite |   Row Total | 
## --------------|-------------|-------------|-------------|
##      favorite |        2337 |         969 |        3306 | 
##               |       0.390 |       0.162 |             | 
## --------------|-------------|-------------|-------------|
##   notfavorite |        1229 |        1465 |        2694 | 
##               |       0.205 |       0.244 |             | 
## --------------|-------------|-------------|-------------|
##  Column Total |        3566 |        2434 |        6000 | 
## --------------|-------------|-------------|-------------|
## 
## 
p_online <- table(online_pred,online_test$shares)
Accuracy <- sum(diag(p_online))/sum(p_online)*100
Accuracy
## [1] 63.36667

The table indicates that for the 6000 records in our test set 969 cases were misclassified, i.e. false negatives or a Type II error, and 1229 actual defaults were misclassified as favorite, i.e. false positives or a Type I error.

Step 3b: Training a model on the data (Random forest)

random_online_model <- randomForest(shares ~ . , data= online_train)
summary(random_online_model)
##                 Length Class  Mode     
## call                3  -none- call     
## type                1  -none- character
## predicted       18000  factor numeric  
## err.rate         1500  -none- numeric  
## confusion           6  -none- numeric  
## votes           36000  matrix numeric  
## oob.times       18000  -none- numeric  
## classes             2  -none- character
## importance         27  -none- numeric  
## importanceSD        0  -none- NULL     
## localImportance     0  -none- NULL     
## proximity           0  -none- NULL     
## ntree               1  -none- numeric  
## mtry                1  -none- numeric  
## forest             14  -none- list     
## y               18000  factor numeric  
## test                0  -none- NULL     
## inbag               0  -none- NULL     
## terms               3  terms  call
importance(random_online_model)
##                               MeanDecreaseGini
## n_tokens_title                       295.10618
## n_tokens_content                     520.56843
## n_unique_tokens                      532.52159
## n_non_stop_unique_tokens             530.66563
## num_imgs                             243.03143
## num_videos                           142.61346
## average_token_length                 543.34142
## num_keywords                         270.29497
## data_channel_is_lifestyle             33.45672
## data_channel_is_entertainment         97.26446
## data_channel_is_bus                   55.29694
## data_channel_is_socmed               101.37706
## data_channel_is_tech                  63.70337
## data_channel_is_world                 84.09383
## kw_max_max                           211.64429
## kw_min_avg                           462.06308
## self_reference_max_shares            430.49478
## self_reference_avg_sharess           533.14965
## is_weekend                           175.69206
## global_subjectivity                  565.15412
## global_sentiment_polarity            512.15437
## global_rate_positive_words           515.59124
## global_rate_negative_words           479.27852
## rate_positive_words                  438.92213
## avg_negative_polarity                491.61026
## title_sentiment_polarity             296.59315
## title_subjectivity                   273.42593

Step 4: Model evaluation(Random Forest)

random_online_pred <- predict(random_online_model,online_test)
p_random <- table(random_online_pred,online_test$shares)
Accuracy_random <- sum(diag(p_random))/sum(p_random)*100
Accuracy_random
## [1] 64.38333

Random Forest Model is the best model as it gives the highest accuracy 64.71% .Important features in determining the news popularity are (global_subjectivity,average_token_length,n_unique_tokens,n_tokens_content,self_reference_avg_sharess) which shows that the online new popularity depends mostly upon the (Global) Subject of the article , Average length of article , content and uniqueness of article.