Understanding different classification techniques using original examples and datasets

Method 1: Tree-based Classification

Tree-based Analysis with Credit Data

The data set used for this method is the credit.csv file.

## Step 1: Collecting the data ##
library(moments)

# Read the data file
credit <- read.csv(file="C:\\Users\\Owner\\Dropbox\\Harrisburg Semesters\\ANLY 530\\Lab1\\credit.csv")

# To read the structure of data frame
str(credit)
## 'data.frame':    1000 obs. of  17 variables:
##  $ checking_balance    : Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
##  $ months_loan_duration: int  6 48 12 42 24 36 24 36 12 30 ...
##  $ credit_history      : Factor w/ 5 levels "critical","good",..: 1 2 1 2 4 2 2 2 2 1 ...
##  $ purpose             : Factor w/ 6 levels "business","car",..: 5 5 4 5 2 4 5 2 5 2 ...
##  $ amount              : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ savings_balance     : Factor w/ 5 levels "< 100 DM","> 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
##  $ employment_duration : Factor w/ 5 levels "< 1 year","> 7 years",..: 2 3 4 4 3 3 2 3 4 5 ...
##  $ percent_of_income   : int  4 2 2 2 3 2 3 2 2 4 ...
##  $ years_at_residence  : int  4 2 3 4 4 4 4 2 4 2 ...
##  $ age                 : int  67 22 49 45 53 35 53 35 61 28 ...
##  $ other_credit        : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ housing             : Factor w/ 3 levels "other","own",..: 2 2 2 1 1 1 2 3 2 2 ...
##  $ existing_loans_count: int  2 1 1 1 2 1 1 1 1 2 ...
##  $ job                 : Factor w/ 4 levels "management","skilled",..: 2 2 4 2 2 4 2 1 4 1 ...
##  $ dependents          : int  1 1 2 2 2 2 1 1 1 1 ...
##  $ phone               : Factor w/ 2 levels "no","yes": 2 1 1 1 1 2 1 2 1 1 ...
##  $ default             : Factor w/ 2 levels "no","yes": 1 2 1 1 2 1 1 1 1 2 ...
## Step 2: Exploring the data ##
# Summary statistics of the loan amount
summary(credit$amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
# Checking the total defaults within the credit data
table(credit$default)
## 
##  no yes 
## 700 300
# Shuffling the 1000 records into a random order (order(runif(1000)) produces a random permutation of the rows)
set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]

# Comparing the summaries of the original and shuffled data frames; they should be identical
summary(credit$amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
summary(credit_rand$amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
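
# An equivalent, arguably more idiomatic way to shuffle the rows is sample();
# this is only a sketch of an alternative and is not used in the steps below
set.seed(12345)
credit_rand_alt <- credit[sample(nrow(credit)), ]
summary(credit_rand_alt$amount)  # summary statistics are again unchanged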
# Using 90% of the records (900) for the training set and the remaining 100 for the test set
credit_train <- credit_rand[1:900, ]
credit_test <- credit_rand[901:1000, ]

# Checking that the class proportions are similar in the training and test sets
prop.table(table(credit_train$default))
## 
##        no       yes 
## 0.7022222 0.2977778
prop.table(table(credit_test$default))
## 
##   no  yes 
## 0.68 0.32
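
# Optional sketch (assumes the caret package is installed, which this lab does
# not otherwise use): createDataPartition() draws a stratified sample so the
# no/yes proportions in the training and test sets match almost exactly
library(caret)
set.seed(12345)
in_train <- createDataPartition(credit_rand$default, p = 0.9, list = FALSE)
credit_train_strat <- credit_rand[in_train, ]
credit_test_strat  <- credit_rand[-in_train, ]
prop.table(table(credit_train_strat$default))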
## Step 3: Training a model on the data ##
# Putting C5.0 algorithm into action
library(C50)

# A single call creates the decision tree model
credit_model <- C5.0(x = credit_train[-17], y = credit_train$default)

# Using summary to examine the decision tree
summary(credit_model)
## 
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$default)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Tue Dec 18 18:20:41 2018
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 900 cases (17 attributes) from undefined.data
## 
## Decision tree:
## 
## checking_balance = unknown: no (358/44)
## checking_balance in {< 0 DM,> 200 DM,1 - 200 DM}:
## :...credit_history in {perfect,very good}:
##     :...dependents > 1: yes (10/1)
##     :   dependents <= 1:
##     :   :...savings_balance = < 100 DM: yes (39/11)
##     :       savings_balance in {> 1000 DM,500 - 1000 DM,unknown}: no (8/1)
##     :       savings_balance = 100 - 500 DM:
##     :       :...checking_balance = < 0 DM: no (1)
##     :           checking_balance in {> 200 DM,1 - 200 DM}: yes (5/1)
##     credit_history in {critical,good,poor}:
##     :...months_loan_duration <= 11: no (87/14)
##         months_loan_duration > 11:
##         :...savings_balance = > 1000 DM: no (13)
##             savings_balance in {< 100 DM,100 - 500 DM,500 - 1000 DM,unknown}:
##             :...checking_balance = > 200 DM:
##                 :...dependents > 1: yes (3)
##                 :   dependents <= 1:
##                 :   :...credit_history in {good,poor}: no (23/3)
##                 :       credit_history = critical:
##                 :       :...amount <= 2337: yes (3)
##                 :           amount > 2337: no (6)
##                 checking_balance = 1 - 200 DM:
##                 :...savings_balance = unknown: no (34/6)
##                 :   savings_balance in {< 100 DM,100 - 500 DM,500 - 1000 DM}:
##                 :   :...months_loan_duration > 45: yes (11/1)
##                 :       months_loan_duration <= 45:
##                 :       :...other_credit = store:
##                 :           :...age <= 35: yes (4)
##                 :           :   age > 35: no (2)
##                 :           other_credit = bank:
##                 :           :...years_at_residence <= 1: no (3)
##                 :           :   years_at_residence > 1:
##                 :           :   :...existing_loans_count <= 1: yes (5)
##                 :           :       existing_loans_count > 1:
##                 :           :       :...percent_of_income <= 2: no (4/1)
##                 :           :           percent_of_income > 2: yes (3)
##                 :           other_credit = none:
##                 :           :...job = unemployed: no (1)
##                 :               job = management:
##                 :               :...amount <= 7511: no (10/3)
##                 :               :   amount > 7511: yes (7)
##                 :               job = unskilled: [S1]
##                 :               job = skilled:
##                 :               :...dependents <= 1: no (55/15)
##                 :                   dependents > 1:
##                 :                   :...age <= 34: no (3)
##                 :                       age > 34: yes (4)
##                 checking_balance = < 0 DM:
##                 :...job = management: no (26/6)
##                     job = unemployed: yes (4/1)
##                     job = unskilled:
##                     :...employment_duration in {4 - 7 years,
##                     :   :                       unemployed}: no (4)
##                     :   employment_duration = < 1 year:
##                     :   :...other_credit = bank: no (1)
##                     :   :   other_credit in {none,store}: yes (11/2)
##                     :   employment_duration = > 7 years:
##                     :   :...other_credit in {bank,none}: no (5/1)
##                     :   :   other_credit = store: yes (2)
##                     :   employment_duration = 1 - 4 years:
##                     :   :...age <= 39: no (14/3)
##                     :       age > 39:
##                     :       :...credit_history in {critical,good}: yes (3)
##                     :           credit_history = poor: no (1)
##                     job = skilled:
##                     :...credit_history = poor:
##                         :...savings_balance in {< 100 DM,100 - 500 DM,
##                         :   :                   500 - 1000 DM}: yes (8)
##                         :   savings_balance = unknown: no (1)
##                         credit_history = critical:
##                         :...other_credit = store: no (0)
##                         :   other_credit = bank: yes (4)
##                         :   other_credit = none:
##                         :   :...savings_balance in {100 - 500 DM,
##                         :       :                   unknown}: no (1)
##                         :       savings_balance = 500 - 1000 DM: yes (1)
##                         :       savings_balance = < 100 DM:
##                         :       :...months_loan_duration <= 13:
##                         :           :...percent_of_income <= 3: yes (3)
##                         :           :   percent_of_income > 3: no (3/1)
##                         :           months_loan_duration > 13:
##                         :           :...amount <= 5293: no (10/1)
##                         :               amount > 5293: yes (2)
##                         credit_history = good:
##                         :...existing_loans_count > 1: yes (5)
##                             existing_loans_count <= 1:
##                             :...other_credit = store: no (2)
##                                 other_credit = bank:
##                                 :...percent_of_income <= 2: yes (2)
##                                 :   percent_of_income > 2: no (6/1)
##                                 other_credit = none: [S2]
## 
## SubTree [S1]
## 
## employment_duration in {< 1 year,1 - 4 years}: yes (11/3)
## employment_duration in {> 7 years,4 - 7 years,unemployed}: no (8)
## 
## SubTree [S2]
## 
## savings_balance = 100 - 500 DM: yes (3)
## savings_balance = 500 - 1000 DM: no (1)
## savings_balance = unknown:
## :...phone = no: yes (9/1)
## :   phone = yes: no (3/1)
## savings_balance = < 100 DM:
## :...percent_of_income <= 1: no (4)
##     percent_of_income > 1:
##     :...phone = yes: yes (10/1)
##         phone = no:
##         :...purpose in {business,car0,education,renovations}: yes (3)
##             purpose = car:
##             :...percent_of_income <= 3: no (2)
##             :   percent_of_income > 3: yes (6/1)
##             purpose = furniture/appliances:
##             :...years_at_residence <= 1: no (4)
##                 years_at_residence > 1:
##                 :...housing = other: no (1)
##                     housing = rent: yes (2)
##                     housing = own:
##                     :...amount <= 1778: no (3)
##                         amount > 1778:
##                         :...years_at_residence <= 3: yes (6)
##                             years_at_residence > 3: no (3/1)
## 
## 
## Evaluation on training data (900 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      66  125(13.9%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     609    23    (a): class no
##     102   166    (b): class yes
## 
## 
##  Attribute usage:
## 
##  100.00% checking_balance
##   60.22% credit_history
##   53.22% months_loan_duration
##   49.44% savings_balance
##   30.89% job
##   25.89% other_credit
##   17.78% dependents
##    9.67% existing_loans_count
##    7.22% percent_of_income
##    6.67% employment_duration
##    5.78% phone
##    5.56% amount
##    3.78% years_at_residence
##    3.44% age
##    3.33% purpose
##    1.67% housing
## 
## 
## Time: 0.1 secs
# Based on the evaluation above, the algorithm correctly classified 609 'no' records and 166 'yes' records; the remaining 125 records (23 false 'yes' plus 102 false 'no') give a 13.9% training (misclassification) error.
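
# Optional sketch (results not shown here): C5.0 supports adaptive boosting via
# the trials argument; trials = 10 is a common starting point and often lowers
# the 13.9% training error reported above
credit_boost10 <- C5.0(credit_train[-17], credit_train$default, trials = 10)
summary(credit_boost10)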

## Step 4: Evaluating Model Performance ##
# Generating predictions on the test set to evaluate the model's performance
cred_pred <- predict(credit_model, credit_test)

# Creating a confusion table of predicted versus actual values for the test set
library(gmodels)

# Output of the CrossTable for Credit Data
CrossTable(credit_test$default, cred_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('actual default', 'predicted default'))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                | predicted default 
## actual default |        no |       yes | Row Total | 
## ---------------|-----------|-----------|-----------|
##             no |        57 |        11 |        68 | 
##                |     0.570 |     0.110 |           | 
## ---------------|-----------|-----------|-----------|
##            yes |        16 |        16 |        32 | 
##                |     0.160 |     0.160 |           | 
## ---------------|-----------|-----------|-----------|
##   Column Total |        73 |        27 |       100 | 
## ---------------|-----------|-----------|-----------|
## 
## 
# Predicted no, actual no: 57
# Predicted yes, actual no: 11
# Predicted no, actual yes: 16
# Predicted yes, actual yes: 16
# The table indicates that for the 100 records in our test set, 11 non-defaulters were incorrectly flagged as defaults (false positives, a Type I error), and 16 actual defaults were missed, i.e. classified as not defaulting (false negatives, a Type II error).
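
# A quick numeric summary of the same confusion table (a small sketch using the
# objects created above): overall accuracy plus the two error rates
accuracy <- mean(cred_pred == credit_test$default)                      # (57 + 16) / 100 = 0.73
conf_mat <- table(actual = credit_test$default, predicted = cred_pred)
false_positive_rate <- conf_mat["no", "yes"] / sum(conf_mat["no", ])    # 11 / 68
false_negative_rate <- conf_mat["yes", "no"] / sum(conf_mat["yes", ])   # 16 / 32
accuracy; false_positive_rate; false_negative_rate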

Method 2: Support Vector Machine (SVM)

Letter Recognition with Support Vector Machine

The data set used for this method is the letterdata.csv file.

## Step 1: Collecting the data ##
library(moments)

# Read the data file
letters <- read.csv(file="C:\\Users\\Owner\\Dropbox\\Harrisburg Semesters\\ANLY 530\\Lab1\\letterdata.csv")

# To read the structure of data frame
str(letters)
## 'data.frame':    20000 obs. of  17 variables:
##  $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
##  $ xbox  : int  2 5 4 7 2 4 4 1 2 11 ...
##  $ ybox  : int  8 12 11 11 1 11 2 1 2 15 ...
##  $ width : int  3 3 6 6 3 5 5 3 4 13 ...
##  $ height: int  5 7 8 6 1 8 4 2 4 9 ...
##  $ onpix : int  1 2 6 3 1 3 4 1 2 7 ...
##  $ xbar  : int  8 10 10 5 8 8 8 8 10 13 ...
##  $ ybar  : int  13 5 6 9 6 8 7 2 6 2 ...
##  $ x2bar : int  0 5 2 4 6 6 6 2 2 6 ...
##  $ y2bar : int  6 4 6 6 6 9 6 2 6 2 ...
##  $ xybar : int  6 13 10 4 6 5 7 8 12 12 ...
##  $ x2ybar: int  10 3 3 4 5 6 6 2 4 1 ...
##  $ xy2bar: int  8 9 7 10 9 6 6 8 8 9 ...
##  $ xedge : int  0 2 3 6 1 0 2 1 1 8 ...
##  $ xedgey: int  8 8 7 10 7 8 8 6 6 1 ...
##  $ yedge : int  0 4 3 2 5 9 7 2 1 1 ...
##  $ yedgex: int  8 10 9 8 10 7 10 7 7 8 ...
## Step 2: Preparing the data ##
# Using 90% of the records for training: rows 1-18,000 for the training set and rows 18,001-20,000 for the test set
letters_train <- letters[1:18000, ]
letters_test <- letters[18001:20000, ]

## Step 3: Training a model on the data ##
# The kernlab package provides the kernel-based support vector machine function ksvm().
library(kernlab)
# Set the kernel to 'vanilladot', a simple linear kernel; the resulting SVM model is stored as letter_classifier.
letter_classifier <- ksvm(letter ~ ., data = letters_train, kernel = "vanilladot")
##  Setting default kernel parameters
letter_classifier
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Linear (vanilla) kernel function. 
## 
## Number of Support Vectors : 7886 
## 
## Objective Function Value : -15.3458 -21.3403 -25.7672 -6.8685 -8.8812 -35.9555 -59.5883 -18.1975 -65.6075 -41.5654 -18.8559 -39.3558 -36.9961 -60.3052 -15.1694 -42.144 -35.0941 -19.4069 -15.8234 -38.6718 -33.3013 -8.5298 -12.4387 -38.2194 -14.3682 -9.5508 -165.7154 -53.2778 -79.2163 -134.5053 -184.4809 -58.9285 -46.3252 -81.004 -28.1341 -29.6955 -27.5983 -38.1764 -47.2889 -137.0497 -208.1396 -239.2616 -23.8945 -10.9655 -64.228 -12.2139 -55.7818 -10.8001 -21.2407 -11.1795 -121.5639 -33.2229 -267.3926 -81.0708 -9.4937 -4.6577 -161.5171 -86.7114 -20.9146 -16.8272 -86.6582 -16.7205 -30.3036 -20.0054 -26.2331 -29.9289 -56.1072 -11.6335 -5.2564 -14.8153 -4.983 -4.8171 -8.5044 -43.2267 -55.9 -214.755 -47.0748 -49.6539 -50.2278 -18.3767 -19.1813 -97.6132 -113.6502 -42.4112 -32.5859 -127.4807 -33.7418 -30.7568 -40.0953 -18.6792 -5.4826 -49.3916 -10.6142 -20.0286 -63.8287 -183.8297 -57.0671 -43.3721 -35.2783 -85.4451 -145.9585 -11.8002 -6.1194 -12.5323 -33.5245 -155.2248 -57.2602 -194.0785 -111.0155 -10.8207 -16.7926 -3.7766 -77.3561 -7.9004 -106.5759 -52.523 -107.0402 -78.0148 -74.4773 -24.8166 -13.2372 -7.8706 -27.2788 -13.2342 -280.2869 -32.7288 -25.9531 -149.5447 -153.8495 -10.0146 -40.8917 -6.7333 -65.2053 -72.818 -35.1252 -246.7046 -38.0738 -16.9126 -158.18 -184.0021 -50.8427 -28.7686 -164.5969 -97.8359 -386.1426 -160.3188 -181.8759 -38.3648 -37.2272 -60.116 -28.2074 -53.7383 -7.8729 -12.3159 -37.8942 -72.6434 -211.8342 -58.5023 -105.1605 -176.7259 -685.8994 -142.8147 -159.635 -366.9437 -37.6409 -73.1357 -175.1906 -131.2833 -41.1464 -77.8404 -57.8131 -8.6365 -251.3728 -14.0836 -36.5144 -2.2292 -6.1598 -16.8011 -26.5165 -67.19 -21.3366 -221.4815 -22.9219 -4.2616 -4.7901 -0.8263 -134.7538 -8.8843 -83.1109 -23.1019 -14.4251 -5.7337 -17.5244 -29.7925 -23.9243 -88.9084 -28.6719 -106.0564 -16.4981 -10.6486 -7.9315 -1.5742 -91.1706 -7.3819 -118.2628 -117.5543 -48.5606 -26.6093 -71.2968 -30.4913 -63.5712 -279.2921 -46.3025 -50.4912 -37.9431 -21.5243 -11.6202 -134.9023 -7.516 -5.8131 -10.1595 -13.6329 -27.0293 -25.7282 -151.8511 -39.0524 -105.4861 -34.2434 -15.7051 -10.2304 -3.6687 -98.2094 -7.4666 -15.2668 -75.1283 -116.5382 -16.6429 -14.9215 -55.1062 -3.0636 -8.4262 -93.6829 -38.1162 -123.1859 -4.9078 -9.1612 -1.3077 -102.9021 -23.1138 -8.5262 -57.2623 -3.4297 -20.9579 -78.2019 -50.3741 -62.3531 -6.4908 -21.9308 -2.3736 -84.3835 -126.3997 -114.8723 -26.4109 -21.5589 -61.6405 -34.9162 -66.3243 -25.1148 -6.7203 -4.6695 -65.3518 -39.7924 -67.3505 -36.2154 -10.9031 -62.2195 -14.9491 -24.3238 -65.0847 -4.9657 -64.2797 -278.2873 -14.6902 -13.9198 -18.2059 -9.8972 -78.2645 -17.454 -49.5929 -55.7786 -28.7673 -15.9476 -47.531 -17.4379 -71.0516 -5.6899 -6.2519 -97.5508 -3.8196 -7.0502 -1.1238 -147.6952 -28.2018 -414.2586 -32.3275 -35.1191 -4.9605 -90.2307 -151.3409 -90.0329 -27.9491 -42.4688 -12.5118 -26.4828 -2.0045 -62.195 -9.1662 -178.4616 -1.9406 -1.9871 -11.3982 -0.5214 -29.6136 -35.0449 -6.7569 
## Training error : 0.1335
# The result shows a training error of 13.35%

## Step 4: Evaluating Model Performance ##
# Generating predictions on the test set to evaluate the model's performance
letter_predictions <- predict(letter_classifier, letters_test)

# Creating a confusion table of predicted versus actual letters for the test set
table(letter_predictions, letters_test$letter)
##                   
## letter_predictions  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P  Q  R
##                  A 73  0  0  0  0  0  0  0  0  1  0  0  0  0  3  0  4  0
##                  B  0 61  0  3  2  0  1  1  0  0  1  1  0  0  0  2  0  1
##                  C  0  0 64  0  2  0  4  2  1  0  1  2  0  0  1  0  0  0
##                  D  2  1  0 67  0  0  1  3  3  2  1  2  0  3  4  2  1  2
##                  E  0  0  1  0 64  1  1  0  0  0  2  2  0  0  0  0  2  0
##                  F  0  0  0  0  0 70  1  1  4  0  0  0  0  0  0  5  1  0
##                  G  1  1  2  1  3  2 68  1  0  0  0  1  0  0  0  0  4  1
##                  H  0  0  0  1  0  1  0 46  0  2  3  1  1  1  9  0  0  5
##                  I  0  0  0  0  0  0  0  0 65  3  0  0  0  0  0  0  0  0
##                  J  0  1  0  0  0  1  0  0  3 61  0  0  0  0  1  0  0  0
##                  K  0  1  4  0  0  0  0  5  0  0 56  0  0  2  0  0  0  4
##                  L  0  0  0  0  1  0  0  1  0  0  0 63  0  0  0  0  0  0
##                  M  0  0  1  0  0  0  1  0  0  0  0  0 70  2  0  0  0  0
##                  N  0  0  0  0  0  0  0  0  0  0  0  0  0 77  0  0  0  1
##                  O  0  0  1  1  0  0  0  1  0  1  0  0  0  0 49  1  2  0
##                  P  0  0  0  0  0  3  0  0  0  0  0  0  0  0  2 69  0  0
##                  Q  0  0  0  0  0  0  3  1  0  0  0  2  0  0  2  1 52  0
##                  R  0  4  0  0  1  0  0  3  0  0  3  0  0  0  1  0  0 64
##                  S  0  1  0  0  1  1  1  0  1  1  0  0  0  0  0  0  6  0
##                  T  0  0  0  0  1  1  0  0  0  0  1  0  0  0  0  0  0  0
##                  U  0  0  2  1  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##                  V  0  0  0  0  0  0  0  0  0  0  0  0  1  0  1  0  0  0
##                  W  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0
##                  X  0  1  0  0  1  0  0  1  0  0  1  4  0  0  0  0  0  1
##                  Y  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  4  0  0
##                  Z  1  0  0  0  2  0  0  0  0  2  0  0  0  0  0  0  0  0
##                   
## letter_predictions  S  T  U  V  W  X  Y  Z
##                  A  0  1  2  0  1  0  0  0
##                  B  3  0  0  0  0  0  0  0
##                  C  0  0  0  0  0  0  0  0
##                  D  0  0  0  0  0  0  1  0
##                  E  6  0  0  0  0  1  0  0
##                  F  2  0  0  1  0  0  2  0
##                  G  3  2  0  0  0  0  0  0
##                  H  0  3  0  2  0  0  1  0
##                  I  2  0  0  0  0  2  1  0
##                  J  1  0  0  0  0  1  0  4
##                  K  0  1  2  0  0  4  0  0
##                  L  0  0  0  0  0  0  0  0
##                  M  0  0  1  0  6  0  0  0
##                  N  0  0  1  0  2  0  0  0
##                  O  0  0  1  0  0  0  0  0
##                  P  0  0  0  0  0  0  1  0
##                  Q  1  0  0  0  0  0  0  0
##                  R  0  1  0  1  0  0  0  0
##                  S 47  1  0  0  0  1  0  6
##                  T  1 83  1  0  0  0  2  2
##                  U  0  0 83  0  0  0  0  0
##                  V  0  0  0 64  1  0  1  0
##                  W  0  0  0  3 59  0  0  0
##                  X  0  0  0  0  0 76  1  0
##                  Y  0  1  0  0  0  1 58  0
##                  Z  5  1  0  0  0  0  0 70
# Values on the diagonal are the correctly predicted letters in the test set

agreement <- letter_predictions == letters_test$letter
table(agreement)
## agreement
## FALSE  TRUE 
##   321  1679
# This shows that the letters were classified correctly in 1679 test records out of 2000.

prop.table(table(agreement))
## agreement
##  FALSE   TRUE 
## 0.1605 0.8395
# With the SVM model using a vanilladot (linear) kernel, test-set accuracy is approximately 83.95%.
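
# A possible refinement (a sketch only; its output is not shown in this report):
# the Gaussian RBF kernel 'rbfdot' often improves on the linear kernel, with
# kpar = "automatic" letting kernlab estimate the kernel width sigma
letter_classifier_rbf <- ksvm(letter ~ ., data = letters_train, kernel = "rbfdot", kpar = "automatic")
letter_predictions_rbf <- predict(letter_classifier_rbf, letters_test)
mean(letter_predictions_rbf == letters_test$letter)  # proportion classified correctly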

Method 3: Adding Regression to Trees

White Wine Quality Analysis with Adding Regression to Trees

The data set used for this method is the whitewines.csv file.

## Step 1: Collecting the data ##
library(moments)

# Read the data file
wine <- read.csv(file="C:\\Users\\Owner\\Dropbox\\Harrisburg Semesters\\ANLY 530\\Lab1\\whitewines.csv")

# To read the structure of data frame
str(wine)
## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
##  $ volatile.acidity    : num  0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
##  $ citric.acid         : num  0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
##  $ residual.sugar      : num  1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
##  $ chlorides           : num  0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
##  $ free.sulfur.dioxide : num  6 41 33 11 36 22 33 17 34 40 ...
##  $ total.sulfur.dioxide: num  62 113 123 74 119 95 152 67 90 130 ...
##  $ density             : num  0.993 0.999 0.995 0.991 0.993 ...
##  $ pH                  : num  3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
##  $ sulphates           : num  0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
##  $ alcohol             : num  10.4 8.9 10.1 11.2 10.9 ...
##  $ quality             : int  5 6 6 4 6 6 6 6 6 7 ...
# We will examine the distribution of wine quality using a histogram
hist(wine$quality)

# The wine quality scores appear approximately normally distributed, centered around 6

## Step 2: Exploring and Preparing the Data ##
# Using roughly 75% of the records (3,750) for the training set and the remaining 1,148 for the test set
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]

## Step 3: Training a model on the data ##
# We will begin by training a regression tree model using the rpart (recursive partitioning) package, with its default settings
library(rpart)

# rpart() is called with quality as the outcome variable, using the dot notation so that all the other columns in the wine_train data frame are used as predictors
m.rpart <- rpart(quality ~ ., data=wine_train)
m.rpart
## n= 3750 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 3750 2945.53200 5.870933  
##    2) alcohol< 10.85 2372 1418.86100 5.604975  
##      4) volatile.acidity>=0.2275 1611  821.30730 5.432030  
##        8) volatile.acidity>=0.3025 688  278.97670 5.255814 *
##        9) volatile.acidity< 0.3025 923  505.04230 5.563380 *
##      5) volatile.acidity< 0.2275 761  447.36400 5.971091 *
##    3) alcohol>=10.85 1378 1070.08200 6.328737  
##      6) free.sulfur.dioxide< 10.5 84   95.55952 5.369048 *
##      7) free.sulfur.dioxide>=10.5 1294  892.13600 6.391036  
##       14) alcohol< 11.76667 629  430.11130 6.173291  
##         28) volatile.acidity>=0.465 11   10.72727 4.545455 *
##         29) volatile.acidity< 0.465 618  389.71680 6.202265 *
##       15) alcohol>=11.76667 665  403.99400 6.596992 *
# The rpart() function automatically selects alcohol as the most important predictor: of the 3,750 training samples, 2,372 have alcohol < 10.85 and 1,378 have alcohol >= 10.85
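
# A quick way to confirm this (a small sketch): rpart stores an importance score
# for every variable used in a split or as a surrogate; larger values mean more
# influence on the fitted tree
m.rpart$variable.importance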

# Although the tree can be understood using the preceding output, it is more easily understood using visualization. The rpart.plot package provides an easy-to-use function that produces publication-quality decision trees
library(rpart.plot)

# The digits parameter controls the number of numeric digits to be included in the diagram. The rpart.plot function creates a tree diagram
rpart.plot(m.rpart, digits=3)

# The fallen.leaves parameter forces the leaf nodes to be aligned at the bottom of the plot, while the type and extra parameters affect the way the decisions and nodes are labeled
rpart.plot(m.rpart, digits=4, fallen.leaves = TRUE, type = 3, extra = 101)

## Step 4: Evaluating Model Performance ##
# Generating predictions on the test set to evaluate the model's performance
p.rpart <- predict(m.rpart, wine_test)
summary(p.rpart)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.545   5.563   5.971   5.893   6.202   6.597
summary(wine_test$quality)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.901   6.000   9.000
# This finding suggests that the model is not correctly identifying the extreme cases, in particular the best and worst wines since the model range is much smaller than the actual data range

# Correlation between the predicted and actual values provides a way to gauge the model's performance
cor(p.rpart, wine_test$quality)
## [1] 0.5369525
# A correlation of 0.54 is certainly acceptable. However, the correlation only measures how strongly the predictions are related to the true value; it is not a measure of how far off the predictions were from the true values
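
# A complementary check (a small sketch): the mean absolute error (MAE) measures
# how far, on average, the predicted quality scores are from the true scores
MAE <- function(actual, predicted) {
  mean(abs(actual - predicted))
}
MAE(wine_test$quality, p.rpart)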

Applying all three classification techniques to the ‘OnlineNewsPopularity’ dataset and evaluating which method gives the best results

Method 1: Tree-based Classification

Tree-based Analysis with the Online News Popularity dataset

## Step 1: Collecting the data ##
library(moments)

# Read the data file
news <- read.csv(file="C:\\Users\\Owner\\Dropbox\\Harrisburg Semesters\\ANLY 530\\Lab1\\OnlineNewsPopularity.csv")

# To read the structure of data frame
str(news)
## 'data.frame':    39644 obs. of  61 variables:
##  $ url                          : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ timedelta                    : num  731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : num  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : num  219 255 211 531 1072 ...
##  $ n_unique_tokens              : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num  1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num  0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : num  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : num  2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : num  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : num  5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: num  1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : num  0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : num  0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num  496 0 918 0 3151 ...
##  $ weekday_is_monday            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num  0.188 0 0 0 0.136 ...
##  $ shares                       : int  593 711 1500 1200 505 855 556 891 3600 710 ...
## Step 2: Exploring the data ##
# Selecting a subset of features expected to be most relevant to popularity
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)

colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")

# Converting shares into a categorical popularity variable, split at 1400
for (i in 1:39644) {
  if (newsShort$shares[i] >= 1400) {
    newsShort$shares[i] <- "yes"
  } else {
    newsShort$shares[i] <- "no"
  }
}
newsShort$shares <- as.factor(newsShort$shares)

# Shuffling the first 10,000 records of the data set into a random order (order(runif(10000)) permutes rows 1-10,000)
set.seed(12345)
news_rand <- newsShort[order(runif(10000)), ]

# Using 90% of these records (9,000) for the training set and the remaining 1,000 for the test set
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]

# Checking that the class proportions are similar in the training and test sets
prop.table(table(news_train$shares))
## 
##        no       yes 
## 0.1882222 0.8117778
prop.table(table(news_test$shares))
## 
##    no   yes 
## 0.185 0.815
## Step 3: Training a model on the data ##
# Putting C5.0 algorithm into action
library(C50)

# A single call creates the decision tree model
news_model <- C5.0(news_train[-17], news_train$shares)

# Using summary to examine the decision tree
summary(news_model)
## 
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Tue Dec 18 18:21:06 2018
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 9000 cases (17 attributes) from undefined.data
## 
## Decision tree:
##  yes (9000/1694)
## 
## 
## Evaluation on training data (9000 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       1 1694(18.8%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##          1694    (a): class no
##          7306    (b): class yes
## 
## 
## Time: 0.1 secs
# The tree collapsed to a single node that predicts 'yes' for every record: the 7,306 actual 'yes' records are classified correctly, while all 1,694 'no' records are misclassified, giving an 18.8% training (misclassification) error.

## Step 4: Evaluating Model Performance ##
# Generating predictions on the test set to evaluate the model's performance
news_pred <- predict(news_model, news_test)

# Creating a confusion table of predicted versus actual values for the test set
library(gmodels)

# Output of the CrossTable for OnlineNewsPopularity Data
CrossTable(news_test$shares, news_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('actual shares', 'predicted shares'))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1000 
## 
##  
##                  | news_pred 
## news_test$shares |       yes | Row Total | 
## -----------------|-----------|-----------|
##               no |       185 |       185 | 
##                  |     0.185 |           | 
## -----------------|-----------|-----------|
##              yes |       815 |       815 | 
##                  |     0.815 |           | 
## -----------------|-----------|-----------|
##     Column Total |      1000 |      1000 | 
## -----------------|-----------|-----------|
## 
## 
# Predicted yes, actual no: 185
# Predicted yes, actual yes: 815
# The table indicates that for the 1000 records in our test set the model predicted 'yes' for every record: 815 were correctly classified and 185 misclassified, for an accuracy of 81.5% (equal to the proportion of 'yes' records in the test set).
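
# One way to push C5.0 past the single-node "always yes" tree (a hedged sketch;
# the cost-matrix orientation below follows the rows = predicted, columns = actual
# convention used in common C5.0 examples, so check the C50 documentation for the
# installed version): make misclassifying a 'no' article four times as costly
error_cost <- matrix(c(0, 4, 1, 0), nrow = 2,
                     dimnames = list(predicted = c("no", "yes"),
                                     actual = c("no", "yes")))
news_cost_model <- C5.0(news_train[-17], news_train$shares, costs = error_cost)
news_cost_pred <- predict(news_cost_model, news_test)
CrossTable(news_test$shares, news_cost_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('actual shares', 'predicted shares'))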

Method 2: Support Vector Machine (SVM)

Online News Popularity analysis with Support Vector Machine

## Step 1: Collecting the data ##
library(moments)

# Read the data file
news <- read.csv(file="C:\\Users\\Owner\\Dropbox\\Harrisburg Semesters\\ANLY 530\\Lab1\\OnlineNewsPopularity.csv")

# To read the structure of data frame
str(news)
## 'data.frame':    39644 obs. of  61 variables:
##  $ url                          : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ timedelta                    : num  731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : num  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : num  219 255 211 531 1072 ...
##  $ n_unique_tokens              : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num  1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num  0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : num  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : num  2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : num  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : num  5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: num  1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : num  0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : num  0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num  496 0 918 0 3151 ...
##  $ weekday_is_monday            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num  0.188 0 0 0 0.136 ...
##  $ shares                       : int  593 711 1500 1200 505 855 556 891 3600 710 ...
## Step 2: Preparing the data ##
# Reusing the shuffled news_rand data from Method 1: 9,000 records (90%) for the training set and the remaining 1,000 for the test set
news_train1 <- news_rand[1:9000, ]
news_test1 <- news_rand[9001:10000, ]

## Step 3: Training a model on the data ##
# The kernlab package provides the kernel-based support vector machine function ksvm().
library(kernlab)
# Set the kernel to 'vanilladot', a simple linear kernel; the resulting SVM model is stored as news_classifier.
news_classifier <- ksvm(shares ~ ., data = news_train1, kernel = "vanilladot")
##  Setting default kernel parameters
news_classifier
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Linear (vanilla) kernel function. 
## 
## Number of Support Vectors : 3912 
## 
## Objective Function Value : -3388 
## Training error : 0.188222
# The result shows a training error of 18.82%

## Step 4: Evaluating Model Performance ##
# Generating predictions on the test set to evaluate the model's performance
news_predictions <- predict(news_classifier, news_test1)
# Creating a confusion table of predicted versus actual values for the test set
table(news_predictions, news_test1$shares)
##                 
## news_predictions  no yes
##              no    0   0
##              yes 185 815
agreement1 <- news_predictions == news_test1$shares
table(agreement1)
## agreement1
## FALSE  TRUE 
##   185   815
# This shows that news predictions were classified correctly in 815 test records out of 1000.

prop.table(table(agreement1))
## agreement1
## FALSE  TRUE 
## 0.185 0.815
# With the SVM model using a vanilladot (linear) kernel, test-set accuracy is approximately 81.5%, the same as always predicting 'yes'.
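
# Because the SVM predicts 'yes' for every test record, raw accuracy equals the
# 'yes' base rate; Cohen's kappa (a small sketch computed by hand) corrects for
# such chance agreement and should come out near zero here
pr_agree <- mean(news_predictions == news_test1$shares)
pr_yes_actual <- mean(news_test1$shares == "yes")
pr_yes_pred <- mean(news_predictions == "yes")
pr_chance <- pr_yes_actual * pr_yes_pred + (1 - pr_yes_actual) * (1 - pr_yes_pred)
(pr_agree - pr_chance) / (1 - pr_chance)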

Method 3: Adding Regression to Trees

Online News Popularity analysis with Adding Regression to Trees

## Step 1: Collecting the data ##
library(moments)

# Read the data file
news <- read.csv(file="C:\\Users\\Owner\\Dropbox\\Harrisburg Semesters\\ANLY 530\\Lab1\\OnlineNewsPopularity.csv")
news=news[, 29:61]
# To read the structure of data frame
str(news)
## 'data.frame':    39644 obs. of  33 variables:
##  $ self_reference_min_shares   : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares   : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess  : num  496 0 918 0 3151 ...
##  $ weekday_is_monday           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                      : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                      : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                      : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                      : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                      : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity         : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity   : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words  : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words  : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words         : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words         : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity       : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity       : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity       : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity       : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity       : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity       : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity          : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity    : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity      : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity: num  0.188 0 0 0 0.136 ...
##  $ shares                      : int  593 711 1500 1200 505 855 556 891 3600 710 ...
# We will examine the distribution of shares using a histogram
hist(news$shares)

# The share counts are heavily right-skewed rather than normally distributed
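
# A log scale makes the skew easier to see (a small sketch; the + 1 simply guards
# against taking the log of any zero share counts)
hist(log(news$shares + 1))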

## Step 2: Preparing the data ##
library(caTools)

set.seed(123)
# Deriving a binary 'popular' target from avg_positive_polarity, then dropping
# that source column (column 23) so it is not used as a predictor
news$popular <- ifelse(news$avg_positive_polarity > 0.5, 1, 0)
news <- news[, -23]
split <- sample.split(news$popular, SplitRatio = 0.8)

# Splitting the data into training (80%) and test (20%) sets
news_train2 <- subset(news, split == TRUE)
news_test2 <- subset(news, split == FALSE)

## Step 3: Training a model on the data ##
# We will begin by training a regression tree model using the rpart (recursive partitioning) package, with its default settings
library(rpart)

# rpart() is called with popular as the outcome variable, using the dot notation so that all the other columns in the news_train2 data frame are used as predictors
m.rpart <- rpart(popular ~ ., data=news_train2)
m.rpart
## n= 31716 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 31716 1517.48500 0.05038466  
##    2) global_sentiment_polarity< 0.3182905 30960 1168.24000 0.03927649  
##      4) min_positive_polarity< 0.2678571 30271  886.40280 0.03019391  
##        8) global_sentiment_polarity< 0.240115 28139  583.37640 0.02118057  
##         16) min_positive_polarity< 0.1431818 26114  378.35340 0.01470476 *
##         17) min_positive_polarity>=0.1431818 2025  189.80540 0.10469140  
##           34) max_positive_polarity< 0.8416667 1463   37.96036 0.02665755 *
##           35) max_positive_polarity>=0.8416667 562  119.74560 0.30782920 *
##        9) global_sentiment_polarity>=0.240115 2132  270.56850 0.14915570  
##         18) rate_positive_words>=0.7639319 1883  155.47110 0.09081253 *
##         19) rate_positive_words< 0.7639319 249   60.21687 0.59036140 *
##      5) min_positive_polarity>=0.2678571 689  169.62840 0.43831640  
##       10) max_positive_polarity< 0.575 292    0.00000 0.00000000 *
##       11) max_positive_polarity>=0.575 397   72.26700 0.76070530 *
##    3) global_sentiment_polarity>=0.3182905 756  188.97880 0.50529100  
##      6) max_positive_polarity< 0.825 185   27.75135 0.18378380 *
##      7) max_positive_polarity>=0.825 571  135.90890 0.60945710 *
# The rpart() function automatically selects global_sentiment_polarity as the most important predictor; it is used for the root split

# Although the tree can be understood using the preceding output, it is more easily understood using visualization. The rpart.plot package provides an easy-to-use function that produces publication-quality decision trees
library(rpart.plot)

# The digits parameter controls the number of numeric digits to be included in the diagram. The rpart.plot function creates a tree diagram
rpart.plot(m.rpart, digits=1)

# The fallen.leaves parameter forces the leaf nodes to be aligned at the bottom of the plot, while the type and extra parameters affect the way the decisions and nodes are labeled
rpart.plot(m.rpart, digits=1, fallen.leaves = TRUE, type = 3, extra = 101)

## Based on the root split of the tree, global_sentiment_polarity is the most influential predictor of popularity.

## We will run two correlation tests below to check how strongly max_positive_polarity and global_sentiment_polarity are correlated with popularity.

## Step 4: Evaluating Model Performance ##
# Generating predictions on the test set to evaluate the model's performance
p.rpart <- predict(m.rpart, news_test2)
summary(p.rpart)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0147  0.0147  0.0511  0.0147  0.7607
summary(news_test2$max_positive_polarity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.6000  0.8000  0.7532  1.0000  1.0000
summary(news_test2$popular)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.05033 0.00000 1.00000
# Correlation between each candidate predictor and the actual 'popular' values provides a way to gauge how strongly they are related
cor(news_test2$max_positive_polarity, news_test2$popular)
## [1] 0.1760677
summary(news_test2$global_sentiment_polarity)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.37766  0.05737  0.12044  0.11976  0.17808  0.57374
summary(news_test2$popular)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.05033 0.00000 1.00000
cor(news_test2$global_sentiment_polarity, news_test2$popular)
## [1] 0.2417969
# This finding suggests that the model is not correctly identifying the extreme cases, since the model range is much smaller than the actual data range

# A correlation of 0.24 between global_sentiment_polarity and popularity is modest. Correlation also only measures how strongly two quantities move together; it is not a measure of how far off the model's predictions are from the true values
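
# For completeness (a small sketch mirroring the wine evaluation above): correlate
# the tree's own predictions with the actual popular values and compute their MAE
cor(p.rpart, news_test2$popular)
MAE <- function(actual, predicted) mean(abs(actual - predicted))
MAE(news_test2$popular, p.rpart)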