options(repos = "https://cran.rstudio.com/")

Part 1 Decision Trees in R

Step 1

library(datasets)
library(mlbench)
library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

data("Sonar")
library("rpart") 
m <- rpart(Class ~ ., data = Sonar,
           method = "class")
library("rpart.plot")
rpart.plot(m)

p <- predict(m, Sonar, type = "class")
table(p, Sonar$Class)

##    
## p    M  R
##   M 95 10
##   R 16 87

set.seed(12)
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger")
print(model)

## Random Forest 
## 
## 208 samples
##  60 predictor
##   2 classes: 'M', 'R' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 208, 208, 208, 208, 208, 208, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   Accuracy   Kappa    
##    2    gini        0.8078699  0.6108857
##    2    extratrees  0.8188579  0.6341083
##   31    gini        0.7751013  0.5455448
##   31    extratrees  0.8275348  0.6496807
##   60    gini        0.7676651  0.5302055
##   60    extratrees  0.8221403  0.6387246
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 31, splitrule = extratrees
##  and min.node.size = 1.

plot(model)

model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneLength = 5)

set.seed(42)
myGrid <- expand.grid(mtry = c(5, 10, 20, 40, 60),
                      splitrule = c("gini", "extratrees"),
                      min.node.size = 1) ## Minimal node size; default 1 for classification
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneGrid = myGrid,
               trControl = trainControl(method = "cv",
                                       number = 5,
                                       verboseIter = FALSE))
print(model)

## Random Forest 
## 
## 208 samples
##  60 predictor
##   2 classes: 'M', 'R' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 166, 167, 167, 167, 165 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   Accuracy   Kappa    
##    5    gini        0.8173838  0.6297111
##    5    extratrees  0.8273668  0.6497566
##   10    gini        0.8171569  0.6305735
##   10    extratrees  0.8414310  0.6791146
##   20    gini        0.7978716  0.5908457
##   20    extratrees  0.8417740  0.6792866
##   40    gini        0.7879994  0.5727742
##   40    extratrees  0.8466467  0.6897164
##   60    gini        0.7928774  0.5822691
##   60    extratrees  0.8608271  0.7187262
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 60, splitrule = extratrees
##  and min.node.size = 1.

plot(model)

Step 2

library(rpart)
library(rpart.plot)
library(randomForest)

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(caret)
library(MASS)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine()       masks randomForest::combine()
## ✖ dplyr::filter()        masks stats::filter()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ purrr::lift()          masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ dplyr::select()        masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)
library(kableExtra)

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(parsnip)
library(party)

## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## 
## Attaching package: 'modeltools'
## 
## The following object is masked from 'package:parsnip':
## 
##     fit
## 
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich
## 
## Attaching package: 'strucchange'
## 
## The following object is masked from 'package:stringr':
## 
##     boundary
## 
## 
## Attaching package: 'party'
## 
## The following object is masked from 'package:dplyr':
## 
##     where

chocolate <- read_csv("chocolate_tibble-3.csv")

## Rows: 1795 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): company_location, bean_type, broad_bean_origin
## dbl (3): final_grade, review_date, cocoa_percent
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

names(chocolate) <- make.names(names(chocolate)) # normalizes columns names

kable(head(chocolate,5), "html", caption="Table 1. Top 5 columns") %>%
kable_styling("striped")

Table 1. Top 5 columns
final_grade	review_date	cocoa_percent	company_location	bean_type	broad_bean_origin
3.75	2016	0.63	France	NA	Sao Tome
2.75	2015	0.70	France	NA	Togo
3.00	2015	0.70	France	NA	Togo
3.50	2015	0.70	France	NA	Togo
3.50	2015	0.70	France	NA	Peru

set.seed(3456)
trainIndex <- createDataPartition(chocolate$final_grade, p = .8,
                                  list = FALSE)
chocolate_train <- chocolate[ trainIndex,]
chocolate_test <- chocolate[-trainIndex,]

Constructing Regression Tree

spec <- decision_tree() %>%
set_mode("regression") %>%
set_engine("rpart")
print(spec)

## Decision Tree Model Specification (regression)
## 
## Computational engine: rpart

model <- spec %>%
parsnip::fit(formula = final_grade ~ .,data = chocolate_train)

model2 <- rpart(final_grade ~ cocoa_percent + company_location, data=chocolate_train, method="anova")

model3 <- ctree(
  final_grade ~ cocoa_percent + as.factor(company_location), 
  data = chocolate_train)

rpart.plot(model2, box.palette="RdBu", shadow.col="gray", nn=TRUE)

plot(model3)

Hyperparameters

decision_tree(tree_depth = 1) %>%
set_mode("regression") %>%
set_engine("rpart") %>%
parsnip::fit(formula = final_grade ~ .,
data = chocolate_train)

## parsnip model object
## 
## n= 1438 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 1438 321.2239 3.183936  
##   2) cocoa_percent>=0.885 31  11.5000 2.500000 *
##   3) cocoa_percent< 0.885 1407 294.9036 3.199005 *

Random Forest

library(MASS)
data(package="MASS")
boston<-Boston
dim(boston)

## [1] 506  14

names(boston)

##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"

train=sample(1:nrow(Boston),300)
Boston.rf=randomForest(medv ~ . , data = Boston , subset = train)

Plotting the Error vs Number of Trees Graph shows how the error is dropping as we are adding more trees and average them.

plot(Boston.rf)

Evaluating variable importance

importance(Boston.rf)

##         IncNodePurity
## crim       1167.57012
## zn          105.68616
## indus      1382.61992
## chas         76.04384
## nox        1448.30458
## rm         6039.15927
## age         616.75173
## dis        1167.19886
## rad         186.45117
## tax         874.09457
## ptratio    1697.38179
## black       480.12419
## lstat      6329.55370

varImpPlot(Boston.rf)

Part 2

library(readr)
library(psych)

## 
## Attaching package: 'psych'

## The following object is masked from 'package:randomForest':
## 
##     outlier

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(tidyverse)
library(corrplot)

## corrplot 0.92 loaded

library(Hmisc)

## 
## Attaching package: 'Hmisc'

## The following object is masked from 'package:psych':
## 
##     describe

## The following object is masked from 'package:parsnip':
## 
##     translate

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

## The following object is masked from 'package:modeltools':
## 
##     Predict

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

library(ppcor)
library(Metrics)

## 
## Attaching package: 'Metrics'

## The following objects are masked from 'package:caret':
## 
##     precision, recall

Data Exploration

data <- read_csv("Factor-Hair-Revised.csv")

## Rows: 100 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (13): ID, ProdQual, Ecom, TechSup, CompRes, Advertising, ProdLine, Sales...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

dim(data)

## [1] 100  13

str(data)

## spc_tbl_ [100 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ID          : num [1:100] 1 2 3 4 5 6 7 8 9 10 ...
##  $ ProdQual    : num [1:100] 8.5 8.2 9.2 6.4 9 6.5 6.9 6.2 5.8 6.4 ...
##  $ Ecom        : num [1:100] 3.9 2.7 3.4 3.3 3.4 2.8 3.7 3.3 3.6 4.5 ...
##  $ TechSup     : num [1:100] 2.5 5.1 5.6 7 5.2 3.1 5 3.9 5.1 5.1 ...
##  $ CompRes     : num [1:100] 5.9 7.2 5.6 3.7 4.6 4.1 2.6 4.8 6.7 6.1 ...
##  $ Advertising : num [1:100] 4.8 3.4 5.4 4.7 2.2 4 2.1 4.6 3.7 4.7 ...
##  $ ProdLine    : num [1:100] 4.9 7.9 7.4 4.7 6 4.3 2.3 3.6 5.9 5.7 ...
##  $ SalesFImage : num [1:100] 6 3.1 5.8 4.5 4.5 3.7 5.4 5.1 5.8 5.7 ...
##  $ ComPricing  : num [1:100] 6.8 5.3 4.5 8.8 6.8 8.5 8.9 6.9 9.3 8.4 ...
##  $ WartyClaim  : num [1:100] 4.7 5.5 6.2 7 6.1 5.1 4.8 5.4 5.9 5.4 ...
##  $ OrdBilling  : num [1:100] 5 3.9 5.4 4.3 4.5 3.6 2.1 4.3 4.4 4.1 ...
##  $ DelSpeed    : num [1:100] 3.7 4.9 4.5 3 3.5 3.3 2 3.7 4.6 4.4 ...
##  $ Satisfaction: num [1:100] 8.2 5.7 8.9 4.8 7.1 4.7 5.7 6.3 7 5.5 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ID = col_double(),
##   ..   ProdQual = col_double(),
##   ..   Ecom = col_double(),
##   ..   TechSup = col_double(),
##   ..   CompRes = col_double(),
##   ..   Advertising = col_double(),
##   ..   ProdLine = col_double(),
##   ..   SalesFImage = col_double(),
##   ..   ComPricing = col_double(),
##   ..   WartyClaim = col_double(),
##   ..   OrdBilling = col_double(),
##   ..   DelSpeed = col_double(),
##   ..   Satisfaction = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

names(data)

##  [1] "ID"           "ProdQual"     "Ecom"         "TechSup"      "CompRes"     
##  [6] "Advertising"  "ProdLine"     "SalesFImage"  "ComPricing"   "WartyClaim"  
## [11] "OrdBilling"   "DelSpeed"     "Satisfaction"

describe(data)

## data 
## 
##  13  Variables      100  Observations
## --------------------------------------------------------------------------------
## ID 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0      100        1     50.5    33.67     5.95    10.90 
##      .25      .50      .75      .90      .95 
##    25.75    50.50    75.25    90.10    95.05 
## 
## lowest :   1   2   3   4   5, highest:  96  97  98  99 100
## --------------------------------------------------------------------------------
## ProdQual 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       43    0.999     7.81     1.61    5.595    5.790 
##      .25      .50      .75      .90      .95 
##    6.575    8.000    9.100    9.410    9.900 
## 
## lowest : 5   5.1 5.2 5.5 5.6, highest: 9.4 9.5 9.6 9.9 10 
## --------------------------------------------------------------------------------
## Ecom 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       27    0.996    3.672   0.7674    2.595    2.800 
##      .25      .50      .75      .90      .95 
##    3.275    3.600    3.925    4.530    5.100 
## 
## lowest : 2.2 2.4 2.5 2.6 2.7, highest: 4.9 5.1 5.5 5.6 5.7
## --------------------------------------------------------------------------------
## TechSup 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       50    0.999    5.365    1.755    2.700    3.280 
##      .25      .50      .75      .90      .95 
##    4.250    5.400    6.625    7.210    7.605 
## 
## lowest : 1.3 2.5 2.6 2.7 3  , highest: 7.7 7.9 8   8.4 8.5
## --------------------------------------------------------------------------------
## CompRes 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       45    0.999    5.442    1.388    3.595    3.900 
##      .25      .50      .75      .90      .95 
##    4.600    5.450    6.325    7.010    7.305 
## 
## lowest : 2.6 3   3.2 3.5 3.6, highest: 7.4 7.5 7.6 7.7 7.8
## --------------------------------------------------------------------------------
## Advertising 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       41    0.999     4.01    1.302    2.200    2.400 
##      .25      .50      .75      .90      .95 
##    3.175    4.000    4.800    5.510    5.800 
## 
## lowest : 1.9 2.1 2.2 2.3 2.4, highest: 5.7 5.8 5.9 6.3 6.5
## --------------------------------------------------------------------------------
## ProdLine 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       42    0.999    5.805    1.509    3.900    4.190 
##      .25      .50      .75      .90      .95 
##    4.700    5.750    6.800    7.600    7.805 
## 
## lowest : 2.3 2.9 3.3 3.6 3.9, highest: 7.7 7.8 7.9 8.3 8.4
## --------------------------------------------------------------------------------
## SalesFImage 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       35    0.997    5.123     1.19    3.385    3.790 
##      .25      .50      .75      .90      .95 
##    4.500    4.900    5.800    6.610    7.100 
## 
## lowest : 2.9 3   3.1 3.4 3.5, highest: 6.8 6.9 7.1 7.8 8.2
## --------------------------------------------------------------------------------
## ComPricing 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       45    0.998    6.974    1.778    4.500    4.800 
##      .25      .50      .75      .90      .95 
##    5.875    7.100    8.400    8.810    9.105 
## 
## lowest : 3.7 3.8 4.4 4.5 4.6, highest: 9.2 9.3 9.6 9.7 9.9
## --------------------------------------------------------------------------------
## WartyClaim 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       34    0.998    6.043   0.9372    4.795    5.000 
##      .25      .50      .75      .90      .95 
##    5.400    6.100    6.600    7.200    7.305 
## 
## lowest : 4.1 4.3 4.5 4.7 4.8, highest: 7.3 7.4 7.5 7.7 8.1
## --------------------------------------------------------------------------------
## OrdBilling 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       37    0.998    4.278    1.033    2.595    3.000 
##      .25      .50      .75      .90      .95 
##    3.700    4.400    4.800    5.400    5.605 
## 
## lowest : 2   2.1 2.4 2.5 2.6, highest: 5.5 5.6 5.7 6.5 6.7
## --------------------------------------------------------------------------------
## DelSpeed 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       30    0.997    3.886   0.8267    2.595    2.990 
##      .25      .50      .75      .90      .95 
##    3.400    3.900    4.425    4.710    4.900 
## 
## lowest : 1.6 2   2.4 2.5 2.6, highest: 4.7 4.8 4.9 5.2 5.5
## --------------------------------------------------------------------------------
## Satisfaction 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       29    0.997    6.918    1.371    5.190    5.400 
##      .25      .50      .75      .90      .95 
##    6.000    7.050    7.625    8.600    8.900 
## 
## lowest : 4.7 4.8 5   5.2 5.4, highest: 8.6 8.7 8.9 9   9.9
## --------------------------------------------------------------------------------

 data_X <- subset(data, select=-c(1))

Correlation Analysis

Inter-Item Correlation Matrix

datamatrix <- cor(data_X[-12]) # from library corrplot
corrplot(datamatrix, method="number")

corrplot(datamatrix, order="hclust", type='upper',tl.srt = 45)

These variables are highly correlated:: 1. CompRes and DelSpeed 2. OrdBilling and CompRes 3. WartyClaim and TechSupport 4. CompRes and OrdBilling 5. OrdBilling and DelSpeed 6. Ecom and SalesFImage

Correlation Coefficient

coeff <- pcor(data_X[-12], method="pearson") # from library ppcor

coeff$estimate

##                ProdQual        Ecom     TechSup     CompRes Advertising
## ProdQual     1.00000000 -0.06137387  0.04526368  0.06182758  0.10718506
## Ecom        -0.06137387  1.00000000  0.06805570 -0.09741963  0.01546781
## TechSup      0.04526368  0.06805570  1.00000000  0.15566994 -0.06193553
## CompRes      0.06182758 -0.09741963  0.15566994  1.00000000 -0.07373805
## Advertising  0.10718506  0.01546781 -0.06193553 -0.07373805  1.00000000
## ProdLine     0.50256298  0.10050431 -0.11741341  0.05359792 -0.14272350
## SalesFImage  0.04162563  0.72474230 -0.07590728  0.12393577  0.31079614
## ComPricing  -0.08486137  0.04664698 -0.13853368 -0.01994195 -0.05965306
## WartyClaim  -0.12211328 -0.09991399  0.78713506 -0.12737815  0.03173627
## OrdBilling   0.18447638  0.11302141 -0.15973237  0.32236273 -0.03983344
## DelSpeed    -0.35476917 -0.04045236 -0.01707376  0.55487929  0.20164019
##                ProdLine SalesFImage  ComPricing  WartyClaim  OrdBilling
## ProdQual     0.50256298  0.04162563 -0.08486137 -0.12211328  0.18447638
## Ecom         0.10050431  0.72474230  0.04664698 -0.09991399  0.11302141
## TechSup     -0.11741341 -0.07590728 -0.13853368  0.78713506 -0.15973237
## CompRes      0.05359792  0.12393577 -0.01994195 -0.12737815  0.32236273
## Advertising -0.14272350  0.31079614 -0.05965306  0.03173627 -0.03983344
## ProdLine     1.00000000 -0.14787285 -0.38577264  0.24605237 -0.26098863
## SalesFImage -0.14787285  1.00000000  0.09204079  0.17477777 -0.11325620
## ComPricing  -0.38577264  0.09204079  1.00000000  0.02832801 -0.10102366
## WartyClaim   0.24605237  0.17477777  0.02832801  1.00000000  0.25041217
## OrdBilling  -0.26098863 -0.11325620 -0.10102366  0.25041217  1.00000000
## DelSpeed     0.52936161  0.08692144  0.18404681 -0.10038822  0.36943703
##                DelSpeed
## ProdQual    -0.35476917
## Ecom        -0.04045236
## TechSup     -0.01707376
## CompRes      0.55487929
## Advertising  0.20164019
## ProdLine     0.52936161
## SalesFImage  0.08692144
## ComPricing   0.18404681
## WartyClaim  -0.10038822
## OrdBilling   0.36943703
## DelSpeed     1.00000000

coeff$p.value

##                 ProdQual         Ecom      TechSup      CompRes Advertising
## ProdQual    0.000000e+00 5.633173e-01 6.700805e-01 5.604309e-01 0.311890776
## Ecom        5.633173e-01 0.000000e+00 5.215362e-01 3.582709e-01 0.884298394
## TechSup     6.700805e-01 5.215362e-01 0.000000e+00 1.406252e-01 0.559745101
## CompRes     5.604309e-01 3.582709e-01 1.406252e-01 0.000000e+00 0.487281744
## Advertising 3.118908e-01 8.842984e-01 5.597451e-01 4.872817e-01 0.000000000
## ProdLine    3.851133e-07 3.431808e-01 2.676856e-01 6.138493e-01 0.177146062
## SalesFImage 6.952327e-01 4.615957e-16 4.745275e-01 2.418211e-01 0.002713553
## ComPricing  4.238387e-01 6.606097e-01 1.903370e-01 8.511723e-01 0.574328622
## WartyClaim  2.488675e-01 3.460372e-01 2.232181e-20 2.288909e-01 0.765218696
## OrdBilling  8.002810e-02 2.861193e-01 1.304285e-01 1.831537e-03 0.707747645
## DelSpeed    5.596498e-04 7.034167e-01 8.723821e-01 1.146684e-08 0.055279741
##                 ProdLine  SalesFImage   ComPricing   WartyClaim   OrdBilling
## ProdQual    3.851133e-07 6.952327e-01 0.4238387206 2.488675e-01 0.0800281045
## Ecom        3.431808e-01 4.615957e-16 0.6606096903 3.460372e-01 0.2861193395
## TechSup     2.676856e-01 4.745275e-01 0.1903369540 2.232181e-20 0.1304284879
## CompRes     6.138493e-01 2.418211e-01 0.8511722571 2.288909e-01 0.0018315375
## Advertising 1.771461e-01 2.713553e-03 0.5743286224 7.652187e-01 0.7077476451
## ProdLine    0.000000e+00 1.618670e-01 0.0001590844 1.872256e-02 0.0124641303
## SalesFImage 1.618670e-01 0.000000e+00 0.3855479040 9.751825e-02 0.2851130484
## ComPricing  1.590844e-04 3.855479e-01 0.0000000000 7.898158e-01 0.3406799281
## WartyClaim  1.872256e-02 9.751825e-02 0.7898158257 0.000000e+00 0.0166642142
## OrdBilling  1.246413e-02 2.851130e-01 0.3406799281 1.666421e-02 0.0000000000
## DelSpeed    6.855711e-08 4.126338e-01 0.0807457060 3.437413e-01 0.0003135242
##                 DelSpeed
## ProdQual    5.596498e-04
## Ecom        7.034167e-01
## TechSup     8.723821e-01
## CompRes     1.146684e-08
## Advertising 5.527974e-02
## ProdLine    6.855711e-08
## SalesFImage 4.126338e-01
## ComPricing  8.074571e-02
## WartyClaim  3.437413e-01
## OrdBilling  3.135242e-04
## DelSpeed    0.000000e+00

corrplot(coeff$estimate, type="upper", order="hclust",
         p.mat = coeff$p.value, sig.level = 0.01, insig = "blank")

From the plot: - The correlation between sales force image (SalesFImage) and e-commerce (Ecom) is highly significant (dark blue icon) - The correlation between delivery speed (DelSpeed) and order billing (OrdBilling) and others… We can assume that there is a high degree of collinearity between the independent variables.

VIF

Multicollinearity can be statistically assessed by computing a score called the variance inflation factor (or VIF), which measures how much the variance of a regression coefficient is inflated due to multicollinearity in the model Fist, build a regression model with all predictor variables. Then run vif() from car library. VIF value is considered high when it is above 5 or 10.

model <- lm(Satisfaction ~., data = data_X)
vif(model)

##    ProdQual        Ecom     TechSup     CompRes Advertising    ProdLine 
##    1.635797    2.756694    2.976796    4.730448    1.508933    3.488185 
## SalesFImage  ComPricing  WartyClaim  OrdBilling    DelSpeed 
##    3.439420    1.635000    3.198337    2.902999    6.516014

DelSpeed has high VIF value. The linear regression model assumption will be violated.

Two of the most commonly used methods to deal with multicollinearity: - Remove highly correlated variables using VIF or stepwise algorithms - Perform an analysis design like principal component analysis (PCA)/ Factor Analysis on the correlated variables

Factor Analysis Test KMO

Note - exclude your Y (dependent variable = predicted = outcome).

KMO (Kaiser-Meyer-Olkin) takes values between 0 and 1. A value greater than 0.5 indicates a fit for factor analysis.

data_fa <- data_X[,-12]
datamatrix <- cor(data_fa)
KMO(r=datamatrix)

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = datamatrix)
## Overall MSA =  0.65
## MSA for each item = 
##    ProdQual        Ecom     TechSup     CompRes Advertising    ProdLine 
##        0.51        0.63        0.52        0.79        0.78        0.62 
## SalesFImage  ComPricing  WartyClaim  OrdBilling    DelSpeed 
##        0.62        0.75        0.51        0.76        0.67

Since MSA > 0.5, we can run Factor Analysis on this data.

Bartlett Test

Bartlett’s Test measures the sampling adequacy and also determines the appropriateness of Factor Analysis. The approximate of Chi-square is 619 with 55 degrees of freedom, which is significant at 0.05 Level of significance.

cortest.bartlett(datamatrix, nrow(data_X))

## $chisq
## [1] 619.2726
## 
## $p.value
## [1] 1.79337e-96
## 
## $df
## [1] 55

Factor Analysis

Scree plot

To determine the number of factors for FActor Analysis is to examine the scree plot of the eigenvalues. Sharp breaks in the plot suggest the appropriate number of components or factors.

ev <- eigen(cor(data_fa))
ev$values

##  [1] 3.42697133 2.55089671 1.69097648 1.08655606 0.60942409 0.55188378
##  [7] 0.40151815 0.24695154 0.20355327 0.13284158 0.09842702

Factor = c(1,2,3,4,5,6,7,8,9,10,11)
Eigen_Values <-ev$values
Scree <- data.frame(Factor, Eigen_Values)
plot(Scree, main = "Scree Plot", col= "Blue",ylim=c(0,4))
lines(Scree,col='Red')
abline(h = 1, col="Green")

Varimax Rotation

nfactors <- 4
fit1 <-factanal(data_fa,nfactors,scores = c("regression"),rotation = "varimax")
print(fit1)

## 
## Call:
## factanal(x = data_fa, factors = nfactors, scores = c("regression"),     rotation = "varimax")
## 
## Uniquenesses:
##    ProdQual        Ecom     TechSup     CompRes Advertising    ProdLine 
##       0.682       0.360       0.228       0.178       0.679       0.005 
## SalesFImage  ComPricing  WartyClaim  OrdBilling    DelSpeed 
##       0.017       0.636       0.163       0.347       0.076 
## 
## Loadings:
##             Factor1 Factor2 Factor3 Factor4
## ProdQual                             0.557 
## Ecom                 0.793                 
## TechSup                      0.872   0.102 
## CompRes      0.884   0.142           0.135 
## Advertising  0.190   0.521          -0.110 
## ProdLine     0.502           0.104   0.856 
## SalesFImage  0.119   0.974          -0.130 
## ComPricing           0.225  -0.216  -0.514 
## WartyClaim                   0.894   0.158 
## OrdBilling   0.794   0.101   0.105         
## DelSpeed     0.928   0.189           0.164 
## 
##                Factor1 Factor2 Factor3 Factor4
## SS loadings      2.592   1.977   1.638   1.423
## Proportion Var   0.236   0.180   0.149   0.129
## Cumulative Var   0.236   0.415   0.564   0.694
## 
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 24.26 on 17 degrees of freedom.
## The p-value is 0.113

Diagram

fa_var <-  fa(r=data_fa, nfactors = 4, rotate="varimax",fm="pa")
fa.diagram(fa_var)

head(fa_var$scores)

##             PA1        PA2          PA3         PA4
## [1,] -0.1338871  0.9175166 -1.719604873  0.09135411
## [2,]  1.6297604 -2.0090053 -0.596361722  0.65808192
## [3,]  0.3637658  0.8361736  0.002979966  1.37548765
## [4,] -1.2225230 -0.5491336  1.245473305 -0.64421384
## [5,] -0.4854209 -0.4276223 -0.026980304  0.47360747
## [6,] -0.5950924 -1.3035333 -1.183019401 -0.95913571

Regression

Factors can be relabeled based on your knowledge of data

regdata <- cbind(data_X[12], fa_var$scores)
#Labeling the data

names(regdata) <- c("Satisfaction", "Purchase", "Marketing",
                    "Post_purchase", "Prod_positioning")
head(regdata)

##   Satisfaction   Purchase  Marketing Post_purchase Prod_positioning
## 1          8.2 -0.1338871  0.9175166  -1.719604873       0.09135411
## 2          5.7  1.6297604 -2.0090053  -0.596361722       0.65808192
## 3          8.9  0.3637658  0.8361736   0.002979966       1.37548765
## 4          4.8 -1.2225230 -0.5491336   1.245473305      -0.64421384
## 5          7.1 -0.4854209 -0.4276223  -0.026980304       0.47360747
## 6          4.7 -0.5950924 -1.3035333  -1.183019401      -0.95913571

set.seed(100)
indices= sample(1:nrow(regdata), 0.7*nrow(regdata))
train=regdata[indices,]
test = regdata[-indices,]

model1 = lm(Satisfaction~., train)
summary(model1)

## 
## Call:
## lm(formula = Satisfaction ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.76280 -0.48717  0.06799  0.46459  1.24022 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       6.91852    0.08068  85.757  < 2e-16 ***
## Purchase          0.50230    0.07641   6.574 9.74e-09 ***
## Marketing         0.75488    0.08390   8.998 5.00e-13 ***
## Post_purchase     0.08755    0.08216   1.066    0.291    
## Prod_positioning  0.58074    0.08781   6.614 8.30e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6629 on 65 degrees of freedom
## Multiple R-squared:  0.7261, Adjusted R-squared:  0.7093 
## F-statistic: 43.08 on 4 and 65 DF,  p-value: < 2.2e-16

Testing VIF

print(vif(model1))

##         Purchase        Marketing    Post_purchase Prod_positioning 
##         1.009648         1.008235         1.015126         1.024050

Model Performance metrics

#Model 1:
pred_test1 <- predict(model1, newdata = test, type = "response")
pred_test1

##        6        8       11       13       17       19       26       33 
## 4.975008 5.908267 6.951629 8.677431 6.613838 6.963113 6.313513 6.141338 
##       34       35       37       40       42       44       49       50 
## 6.158993 7.415742 6.589746 6.858206 7.133989 8.533080 8.765145 8.078744 
##       53       56       57       60       65       67       71       73 
## 7.395438 7.468360 8.744402 6.276660 5.936570 6.650322 8.299545 7.685564 
##       75       80       96       97       99      100 
## 7.330191 6.719528 7.540233 6.143172 8.084583 5.799897

test$Satisfaction_Predicted <- pred_test1
head(test[c(1,6)], 10)

##    Satisfaction Satisfaction_Predicted
## 6           4.7               4.975008
## 8           6.3               5.908267
## 11          7.4               6.951629
## 13          8.4               8.677431
## 17          6.4               6.613838
## 19          6.8               6.963113
## 26          6.6               6.313513
## 33          5.4               6.141338
## 34          7.3               6.158993
## 35          6.3               7.415742

Week 5 Follow Along Practice Decision Trees

Kelvin Anderson

2023-09-25