options(repos = "https://cran.rstudio.com/")
library(datasets)
library(mlbench)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
data("Sonar")
library("rpart")
m <- rpart(Class ~ ., data = Sonar,
           method = "class")
library("rpart.plot")
rpart.plot(m)
p <- predict(m, Sonar, type = "class")
table(p, Sonar$Class)
##
## p M R
## M 95 10
## R 16 87
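caret's confusionMatrix() summarizes the same table together with accuracy, kappa, sensitivity and specificity; a minimal sketch using the in-sample predictions above:
confusionMatrix(p, Sonar$Class)  # accuracy, kappa, sensitivity, specificity, ...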
set.seed(12)
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger")
print(model)
## Random Forest
##
## 208 samples
## 60 predictor
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 208, 208, 208, 208, 208, 208, ...
## Resampling results across tuning parameters:
##
## mtry splitrule Accuracy Kappa
## 2 gini 0.8078699 0.6108857
## 2 extratrees 0.8188579 0.6341083
## 31 gini 0.7751013 0.5455448
## 31 extratrees 0.8275348 0.6496807
## 60 gini 0.7676651 0.5302055
## 60 extratrees 0.8221403 0.6387246
##
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 31, splitrule = extratrees
## and min.node.size = 1.
plot(model)
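The selected hyperparameters can also be read off the train object directly; a small sketch:
model$bestTune       # mtry, splitrule and min.node.size of the chosen model
head(model$results)  # resampled Accuracy/Kappa for each candidate setting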
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneLength = 5)
set.seed(42)
myGrid <- expand.grid(mtry = c(5, 10, 20, 40, 60),
                      splitrule = c("gini", "extratrees"),
                      min.node.size = 1)  # minimal node size; default is 1 for classification
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneGrid = myGrid,
               trControl = trainControl(method = "cv",
                                        number = 5,
                                        verboseIter = FALSE))
print(model)
## Random Forest
##
## 208 samples
## 60 predictor
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 166, 167, 167, 167, 165
## Resampling results across tuning parameters:
##
## mtry splitrule Accuracy Kappa
## 5 gini 0.8173838 0.6297111
## 5 extratrees 0.8273668 0.6497566
## 10 gini 0.8171569 0.6305735
## 10 extratrees 0.8414310 0.6791146
## 20 gini 0.7978716 0.5908457
## 20 extratrees 0.8417740 0.6792866
## 40 gini 0.7879994 0.5727742
## 40 extratrees 0.8466467 0.6897164
## 60 gini 0.7928774 0.5822691
## 60 extratrees 0.8608271 0.7187262
##
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 60, splitrule = extratrees
## and min.node.size = 1.
plot(model)
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
library(MASS)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks randomForest::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(parsnip)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
##
## The following object is masked from 'package:parsnip':
##
## fit
##
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
##
## The following object is masked from 'package:stringr':
##
## boundary
##
##
## Attaching package: 'party'
##
## The following object is masked from 'package:dplyr':
##
## where
chocolate <- read_csv("chocolate_tibble-3.csv")
## Rows: 1795 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): company_location, bean_type, broad_bean_origin
## dbl (3): final_grade, review_date, cocoa_percent
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(chocolate) <- make.names(names(chocolate)) # normalize column names
kable(head(chocolate, 5), "html", caption = "Table 1. First 5 rows") %>%
  kable_styling("striped")
| final_grade | review_date | cocoa_percent | company_location | bean_type | broad_bean_origin |
|---|---|---|---|---|---|
| 3.75 | 2016 | 0.63 | France | NA | Sao Tome |
| 2.75 | 2015 | 0.70 | France | NA | Togo |
| 3.00 | 2015 | 0.70 | France | NA | Togo |
| 3.50 | 2015 | 0.70 | France | NA | Togo |
| 3.50 | 2015 | 0.70 | France | NA | Peru |
set.seed(3456)
trainIndex <- createDataPartition(chocolate$final_grade, p = 0.8,
                                  list = FALSE)
chocolate_train <- chocolate[trainIndex, ]
chocolate_test <- chocolate[-trainIndex, ]
spec <- decision_tree() %>%
  set_mode("regression") %>%
  set_engine("rpart")
print(spec)
## Decision Tree Model Specification (regression)
##
## Computational engine: rpart
model <- spec %>%
  parsnip::fit(formula = final_grade ~ ., data = chocolate_train)
model2 <- rpart(final_grade ~ cocoa_percent + company_location,
                data = chocolate_train, method = "anova")
model3 <- ctree(final_grade ~ cocoa_percent + as.factor(company_location),
                data = chocolate_train)
rpart.plot(model2, box.palette="RdBu", shadow.col="gray", nn=TRUE)
plot(model3)
decision_tree(tree_depth = 1) %>%
  set_mode("regression") %>%
  set_engine("rpart") %>%
  parsnip::fit(formula = final_grade ~ .,
               data = chocolate_train)
## parsnip model object
##
## n= 1438
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 1438 321.2239 3.183936
## 2) cocoa_percent>=0.885 31 11.5000 2.500000 *
## 3) cocoa_percent< 0.885 1407 294.9036 3.199005 *
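For a quick check of fit, the parsnip model can generate predictions with predict(), which takes a new_data argument and returns a tibble with a .pred column. A minimal sketch, evaluated on the training split (chocolate_test would work the same way, provided all of its factor levels were seen during fitting):
preds <- predict(model, new_data = chocolate_train)        # tibble with a .pred column
sqrt(mean((chocolate_train$final_grade - preds$.pred)^2))  # training RMSE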
library(MASS)
data(package="MASS")
boston<-Boston
dim(boston)
## [1] 506 14
names(boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
train <- sample(1:nrow(Boston), 300)
Boston.rf <- randomForest(medv ~ ., data = Boston, subset = train)
Plotting the error against the number of trees shows how the error drops as more trees are added and averaged.
plot(Boston.rf)
Evaluating variable importance
importance(Boston.rf)
## IncNodePurity
## crim 1167.57012
## zn 105.68616
## indus 1382.61992
## chas 76.04384
## nox 1448.30458
## rm 6039.15927
## age 616.75173
## dis 1167.19886
## rad 186.45117
## tax 874.09457
## ptratio 1697.38179
## black 480.12419
## lstat 6329.55370
varImpPlot(Boston.rf)
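Only IncNodePurity is reported above because the forest was grown with the default importance = FALSE. Refitting with importance = TRUE also yields the permutation-based %IncMSE measure; a minimal sketch, not run in the original:
Boston.rf2 <- randomForest(medv ~ ., data = Boston, subset = train,
                           importance = TRUE)
importance(Boston.rf2)  # now includes %IncMSE alongside IncNodePurity
varImpPlot(Boston.rf2)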
library(readr)
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:randomForest':
##
## outlier
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(tidyverse)
library(corrplot)
## corrplot 0.92 loaded
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
##
## describe
## The following object is masked from 'package:parsnip':
##
## translate
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:modeltools':
##
## Predict
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(ppcor)
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
data <- read_csv("Factor-Hair-Revised.csv")
## Rows: 100 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (13): ID, ProdQual, Ecom, TechSup, CompRes, Advertising, ProdLine, Sales...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(data)
## [1] 100 13
str(data)
## spc_tbl_ [100 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ID : num [1:100] 1 2 3 4 5 6 7 8 9 10 ...
## $ ProdQual : num [1:100] 8.5 8.2 9.2 6.4 9 6.5 6.9 6.2 5.8 6.4 ...
## $ Ecom : num [1:100] 3.9 2.7 3.4 3.3 3.4 2.8 3.7 3.3 3.6 4.5 ...
## $ TechSup : num [1:100] 2.5 5.1 5.6 7 5.2 3.1 5 3.9 5.1 5.1 ...
## $ CompRes : num [1:100] 5.9 7.2 5.6 3.7 4.6 4.1 2.6 4.8 6.7 6.1 ...
## $ Advertising : num [1:100] 4.8 3.4 5.4 4.7 2.2 4 2.1 4.6 3.7 4.7 ...
## $ ProdLine : num [1:100] 4.9 7.9 7.4 4.7 6 4.3 2.3 3.6 5.9 5.7 ...
## $ SalesFImage : num [1:100] 6 3.1 5.8 4.5 4.5 3.7 5.4 5.1 5.8 5.7 ...
## $ ComPricing : num [1:100] 6.8 5.3 4.5 8.8 6.8 8.5 8.9 6.9 9.3 8.4 ...
## $ WartyClaim : num [1:100] 4.7 5.5 6.2 7 6.1 5.1 4.8 5.4 5.9 5.4 ...
## $ OrdBilling : num [1:100] 5 3.9 5.4 4.3 4.5 3.6 2.1 4.3 4.4 4.1 ...
## $ DelSpeed : num [1:100] 3.7 4.9 4.5 3 3.5 3.3 2 3.7 4.6 4.4 ...
## $ Satisfaction: num [1:100] 8.2 5.7 8.9 4.8 7.1 4.7 5.7 6.3 7 5.5 ...
## - attr(*, "spec")=
## .. cols(
## .. ID = col_double(),
## .. ProdQual = col_double(),
## .. Ecom = col_double(),
## .. TechSup = col_double(),
## .. CompRes = col_double(),
## .. Advertising = col_double(),
## .. ProdLine = col_double(),
## .. SalesFImage = col_double(),
## .. ComPricing = col_double(),
## .. WartyClaim = col_double(),
## .. OrdBilling = col_double(),
## .. DelSpeed = col_double(),
## .. Satisfaction = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
names(data)
## [1] "ID" "ProdQual" "Ecom" "TechSup" "CompRes"
## [6] "Advertising" "ProdLine" "SalesFImage" "ComPricing" "WartyClaim"
## [11] "OrdBilling" "DelSpeed" "Satisfaction"
describe(data)
## data
##
## 13 Variables 100 Observations
## --------------------------------------------------------------------------------
## ID
## n missing distinct Info Mean Gmd .05 .10
## 100 0 100 1 50.5 33.67 5.95 10.90
## .25 .50 .75 .90 .95
## 25.75 50.50 75.25 90.10 95.05
##
## lowest : 1 2 3 4 5, highest: 96 97 98 99 100
## --------------------------------------------------------------------------------
## ProdQual
## n missing distinct Info Mean Gmd .05 .10
## 100 0 43 0.999 7.81 1.61 5.595 5.790
## .25 .50 .75 .90 .95
## 6.575 8.000 9.100 9.410 9.900
##
## lowest : 5 5.1 5.2 5.5 5.6, highest: 9.4 9.5 9.6 9.9 10
## --------------------------------------------------------------------------------
## Ecom
## n missing distinct Info Mean Gmd .05 .10
## 100 0 27 0.996 3.672 0.7674 2.595 2.800
## .25 .50 .75 .90 .95
## 3.275 3.600 3.925 4.530 5.100
##
## lowest : 2.2 2.4 2.5 2.6 2.7, highest: 4.9 5.1 5.5 5.6 5.7
## --------------------------------------------------------------------------------
## TechSup
## n missing distinct Info Mean Gmd .05 .10
## 100 0 50 0.999 5.365 1.755 2.700 3.280
## .25 .50 .75 .90 .95
## 4.250 5.400 6.625 7.210 7.605
##
## lowest : 1.3 2.5 2.6 2.7 3 , highest: 7.7 7.9 8 8.4 8.5
## --------------------------------------------------------------------------------
## CompRes
## n missing distinct Info Mean Gmd .05 .10
## 100 0 45 0.999 5.442 1.388 3.595 3.900
## .25 .50 .75 .90 .95
## 4.600 5.450 6.325 7.010 7.305
##
## lowest : 2.6 3 3.2 3.5 3.6, highest: 7.4 7.5 7.6 7.7 7.8
## --------------------------------------------------------------------------------
## Advertising
## n missing distinct Info Mean Gmd .05 .10
## 100 0 41 0.999 4.01 1.302 2.200 2.400
## .25 .50 .75 .90 .95
## 3.175 4.000 4.800 5.510 5.800
##
## lowest : 1.9 2.1 2.2 2.3 2.4, highest: 5.7 5.8 5.9 6.3 6.5
## --------------------------------------------------------------------------------
## ProdLine
## n missing distinct Info Mean Gmd .05 .10
## 100 0 42 0.999 5.805 1.509 3.900 4.190
## .25 .50 .75 .90 .95
## 4.700 5.750 6.800 7.600 7.805
##
## lowest : 2.3 2.9 3.3 3.6 3.9, highest: 7.7 7.8 7.9 8.3 8.4
## --------------------------------------------------------------------------------
## SalesFImage
## n missing distinct Info Mean Gmd .05 .10
## 100 0 35 0.997 5.123 1.19 3.385 3.790
## .25 .50 .75 .90 .95
## 4.500 4.900 5.800 6.610 7.100
##
## lowest : 2.9 3 3.1 3.4 3.5, highest: 6.8 6.9 7.1 7.8 8.2
## --------------------------------------------------------------------------------
## ComPricing
## n missing distinct Info Mean Gmd .05 .10
## 100 0 45 0.998 6.974 1.778 4.500 4.800
## .25 .50 .75 .90 .95
## 5.875 7.100 8.400 8.810 9.105
##
## lowest : 3.7 3.8 4.4 4.5 4.6, highest: 9.2 9.3 9.6 9.7 9.9
## --------------------------------------------------------------------------------
## WartyClaim
## n missing distinct Info Mean Gmd .05 .10
## 100 0 34 0.998 6.043 0.9372 4.795 5.000
## .25 .50 .75 .90 .95
## 5.400 6.100 6.600 7.200 7.305
##
## lowest : 4.1 4.3 4.5 4.7 4.8, highest: 7.3 7.4 7.5 7.7 8.1
## --------------------------------------------------------------------------------
## OrdBilling
## n missing distinct Info Mean Gmd .05 .10
## 100 0 37 0.998 4.278 1.033 2.595 3.000
## .25 .50 .75 .90 .95
## 3.700 4.400 4.800 5.400 5.605
##
## lowest : 2 2.1 2.4 2.5 2.6, highest: 5.5 5.6 5.7 6.5 6.7
## --------------------------------------------------------------------------------
## DelSpeed
## n missing distinct Info Mean Gmd .05 .10
## 100 0 30 0.997 3.886 0.8267 2.595 2.990
## .25 .50 .75 .90 .95
## 3.400 3.900 4.425 4.710 4.900
##
## lowest : 1.6 2 2.4 2.5 2.6, highest: 4.7 4.8 4.9 5.2 5.5
## --------------------------------------------------------------------------------
## Satisfaction
## n missing distinct Info Mean Gmd .05 .10
## 100 0 29 0.997 6.918 1.371 5.190 5.400
## .25 .50 .75 .90 .95
## 6.000 7.050 7.625 8.600 8.900
##
## lowest : 4.7 4.8 5 5.2 5.4, highest: 8.6 8.7 8.9 9 9.9
## --------------------------------------------------------------------------------
data_X <- subset(data, select = -c(1))   # drop the ID column
datamatrix <- cor(data_X[-12])           # correlation matrix of the predictors (Satisfaction excluded)
corrplot(datamatrix, method = "number")  # corrplot() is from the corrplot package
corrplot(datamatrix, order = "hclust", type = "upper", tl.srt = 45)
These variables are highly correlated:
1. CompRes and DelSpeed
2. OrdBilling and CompRes
3. WartyClaim and TechSup
4. OrdBilling and DelSpeed
5. Ecom and SalesFImage
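These pairs can also be pulled out programmatically by ranking the off-diagonal correlations in datamatrix; a minimal sketch:
cor_pairs <- as.data.frame(as.table(datamatrix))    # long format: variable pair and correlation
names(cor_pairs) <- c("var1", "var2", "cor")
cor_pairs <- subset(cor_pairs, as.character(var1) < as.character(var2))  # keep each pair once
head(cor_pairs[order(-abs(cor_pairs$cor)), ], 6)    # strongest correlations first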
coeff <- pcor(data_X[-12], method="pearson") # from library ppcor
coeff$estimate
## ProdQual Ecom TechSup CompRes Advertising
## ProdQual 1.00000000 -0.06137387 0.04526368 0.06182758 0.10718506
## Ecom -0.06137387 1.00000000 0.06805570 -0.09741963 0.01546781
## TechSup 0.04526368 0.06805570 1.00000000 0.15566994 -0.06193553
## CompRes 0.06182758 -0.09741963 0.15566994 1.00000000 -0.07373805
## Advertising 0.10718506 0.01546781 -0.06193553 -0.07373805 1.00000000
## ProdLine 0.50256298 0.10050431 -0.11741341 0.05359792 -0.14272350
## SalesFImage 0.04162563 0.72474230 -0.07590728 0.12393577 0.31079614
## ComPricing -0.08486137 0.04664698 -0.13853368 -0.01994195 -0.05965306
## WartyClaim -0.12211328 -0.09991399 0.78713506 -0.12737815 0.03173627
## OrdBilling 0.18447638 0.11302141 -0.15973237 0.32236273 -0.03983344
## DelSpeed -0.35476917 -0.04045236 -0.01707376 0.55487929 0.20164019
## ProdLine SalesFImage ComPricing WartyClaim OrdBilling
## ProdQual 0.50256298 0.04162563 -0.08486137 -0.12211328 0.18447638
## Ecom 0.10050431 0.72474230 0.04664698 -0.09991399 0.11302141
## TechSup -0.11741341 -0.07590728 -0.13853368 0.78713506 -0.15973237
## CompRes 0.05359792 0.12393577 -0.01994195 -0.12737815 0.32236273
## Advertising -0.14272350 0.31079614 -0.05965306 0.03173627 -0.03983344
## ProdLine 1.00000000 -0.14787285 -0.38577264 0.24605237 -0.26098863
## SalesFImage -0.14787285 1.00000000 0.09204079 0.17477777 -0.11325620
## ComPricing -0.38577264 0.09204079 1.00000000 0.02832801 -0.10102366
## WartyClaim 0.24605237 0.17477777 0.02832801 1.00000000 0.25041217
## OrdBilling -0.26098863 -0.11325620 -0.10102366 0.25041217 1.00000000
## DelSpeed 0.52936161 0.08692144 0.18404681 -0.10038822 0.36943703
## DelSpeed
## ProdQual -0.35476917
## Ecom -0.04045236
## TechSup -0.01707376
## CompRes 0.55487929
## Advertising 0.20164019
## ProdLine 0.52936161
## SalesFImage 0.08692144
## ComPricing 0.18404681
## WartyClaim -0.10038822
## OrdBilling 0.36943703
## DelSpeed 1.00000000
coeff$p.value
## ProdQual Ecom TechSup CompRes Advertising
## ProdQual 0.000000e+00 5.633173e-01 6.700805e-01 5.604309e-01 0.311890776
## Ecom 5.633173e-01 0.000000e+00 5.215362e-01 3.582709e-01 0.884298394
## TechSup 6.700805e-01 5.215362e-01 0.000000e+00 1.406252e-01 0.559745101
## CompRes 5.604309e-01 3.582709e-01 1.406252e-01 0.000000e+00 0.487281744
## Advertising 3.118908e-01 8.842984e-01 5.597451e-01 4.872817e-01 0.000000000
## ProdLine 3.851133e-07 3.431808e-01 2.676856e-01 6.138493e-01 0.177146062
## SalesFImage 6.952327e-01 4.615957e-16 4.745275e-01 2.418211e-01 0.002713553
## ComPricing 4.238387e-01 6.606097e-01 1.903370e-01 8.511723e-01 0.574328622
## WartyClaim 2.488675e-01 3.460372e-01 2.232181e-20 2.288909e-01 0.765218696
## OrdBilling 8.002810e-02 2.861193e-01 1.304285e-01 1.831537e-03 0.707747645
## DelSpeed 5.596498e-04 7.034167e-01 8.723821e-01 1.146684e-08 0.055279741
## ProdLine SalesFImage ComPricing WartyClaim OrdBilling
## ProdQual 3.851133e-07 6.952327e-01 0.4238387206 2.488675e-01 0.0800281045
## Ecom 3.431808e-01 4.615957e-16 0.6606096903 3.460372e-01 0.2861193395
## TechSup 2.676856e-01 4.745275e-01 0.1903369540 2.232181e-20 0.1304284879
## CompRes 6.138493e-01 2.418211e-01 0.8511722571 2.288909e-01 0.0018315375
## Advertising 1.771461e-01 2.713553e-03 0.5743286224 7.652187e-01 0.7077476451
## ProdLine 0.000000e+00 1.618670e-01 0.0001590844 1.872256e-02 0.0124641303
## SalesFImage 1.618670e-01 0.000000e+00 0.3855479040 9.751825e-02 0.2851130484
## ComPricing 1.590844e-04 3.855479e-01 0.0000000000 7.898158e-01 0.3406799281
## WartyClaim 1.872256e-02 9.751825e-02 0.7898158257 0.000000e+00 0.0166642142
## OrdBilling 1.246413e-02 2.851130e-01 0.3406799281 1.666421e-02 0.0000000000
## DelSpeed 6.855711e-08 4.126338e-01 0.0807457060 3.437413e-01 0.0003135242
## DelSpeed
## ProdQual 5.596498e-04
## Ecom 7.034167e-01
## TechSup 8.723821e-01
## CompRes 1.146684e-08
## Advertising 5.527974e-02
## ProdLine 6.855711e-08
## SalesFImage 4.126338e-01
## ComPricing 8.074571e-02
## WartyClaim 3.437413e-01
## OrdBilling 3.135242e-04
## DelSpeed 0.000000e+00
corrplot(coeff$estimate, type = "upper", order = "hclust",
         p.mat = coeff$p.value, sig.level = 0.01, insig = "blank")
From the plot:
- The correlation between sales force image (SalesFImage) and e-commerce (Ecom) is highly significant (dark blue marker).
- The correlations between delivery speed (DelSpeed) and order billing (OrdBilling), among other pairs, are also strong and significant.
We can therefore assume a high degree of collinearity between the independent variables.
Multicollinearity can be assessed statistically by computing the variance inflation factor (VIF), which measures how much the variance of a regression coefficient is inflated by collinearity among the predictors. First, build a regression model with all predictor variables; then run vif() from the car package. A VIF is generally considered high when it exceeds 5 or 10.
model <- lm(Satisfaction ~., data = data_X)
vif(model)
## ProdQual Ecom TechSup CompRes Advertising ProdLine
## 1.635797 2.756694 2.976796 4.730448 1.508933 3.488185
## SalesFImage ComPricing WartyClaim OrdBilling DelSpeed
## 3.439420 1.635000 3.198337 2.902999 6.516014
DelSpeed has a high VIF value, so the linear regression assumption of non-collinear predictors is violated.
Two of the most commonly used ways to deal with multicollinearity are:
- Remove highly correlated variables, guided by VIF or a stepwise algorithm (see the sketch below).
- Apply a dimension-reduction technique such as principal component analysis (PCA) or factor analysis to the correlated variables.
Note: exclude your Y (the dependent/outcome variable) from these steps.
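As an illustration of the first option, one could drop the highest-VIF predictor (DelSpeed) and re-check the remaining VIFs. A minimal sketch, not part of the original analysis, which instead follows the factor-analysis route:
model_reduced <- lm(Satisfaction ~ . - DelSpeed, data = data_X)  # hypothetical reduced model
vif(model_reduced)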
The KMO (Kaiser-Meyer-Olkin) measure takes values between 0 and 1; a value greater than 0.5 indicates that the data are suitable for factor analysis.
data_fa <- data_X[,-12]
datamatrix <- cor(data_fa)
KMO(r=datamatrix)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = datamatrix)
## Overall MSA = 0.65
## MSA for each item =
## ProdQual Ecom TechSup CompRes Advertising ProdLine
## 0.51 0.63 0.52 0.79 0.78 0.62
## SalesFImage ComPricing WartyClaim OrdBilling DelSpeed
## 0.62 0.75 0.51 0.76 0.67
Since the overall MSA (0.65) is above 0.5, we can run factor analysis on these data.
Bartlett’s test of sphericity checks whether the correlation matrix differs significantly from an identity matrix, and hence whether factor analysis is appropriate. The approximate chi-square statistic is 619 on 55 degrees of freedom, which is significant at the 0.05 level.
cortest.bartlett(datamatrix, nrow(data_X))
## $chisq
## [1] 619.2726
##
## $p.value
## [1] 1.79337e-96
##
## $df
## [1] 55
One way to determine the number of factors for factor analysis is to examine the scree plot of the eigenvalues. Sharp breaks in the plot suggest the appropriate number of components or factors.
ev <- eigen(cor(data_fa))
ev$values
## [1] 3.42697133 2.55089671 1.69097648 1.08655606 0.60942409 0.55188378
## [7] 0.40151815 0.24695154 0.20355327 0.13284158 0.09842702
Factor = c(1,2,3,4,5,6,7,8,9,10,11)
Eigen_Values <-ev$values
Scree <- data.frame(Factor, Eigen_Values)
plot(Scree, main = "Scree Plot", col= "Blue",ylim=c(0,4))
lines(Scree,col='Red')
abline(h = 1, col="Green")
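An alternative to reading the scree plot by eye is parallel analysis from the psych package (already loaded), which compares the observed eigenvalues with those obtained from random data; a minimal sketch:
fa.parallel(data_fa, fm = "pa", fa = "fa")  # suggests a number of factors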
nfactors <- 4
fit1 <-factanal(data_fa,nfactors,scores = c("regression"),rotation = "varimax")
print(fit1)
##
## Call:
## factanal(x = data_fa, factors = nfactors, scores = c("regression"), rotation = "varimax")
##
## Uniquenesses:
## ProdQual Ecom TechSup CompRes Advertising ProdLine
## 0.682 0.360 0.228 0.178 0.679 0.005
## SalesFImage ComPricing WartyClaim OrdBilling DelSpeed
## 0.017 0.636 0.163 0.347 0.076
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4
## ProdQual 0.557
## Ecom 0.793
## TechSup 0.872 0.102
## CompRes 0.884 0.142 0.135
## Advertising 0.190 0.521 -0.110
## ProdLine 0.502 0.104 0.856
## SalesFImage 0.119 0.974 -0.130
## ComPricing 0.225 -0.216 -0.514
## WartyClaim 0.894 0.158
## OrdBilling 0.794 0.101 0.105
## DelSpeed 0.928 0.189 0.164
##
## Factor1 Factor2 Factor3 Factor4
## SS loadings 2.592 1.977 1.638 1.423
## Proportion Var 0.236 0.180 0.149 0.129
## Cumulative Var 0.236 0.415 0.564 0.694
##
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 24.26 on 17 degrees of freedom.
## The p-value is 0.113
fa_var <- fa(r=data_fa, nfactors = 4, rotate="varimax",fm="pa")
fa.diagram(fa_var)
head(fa_var$scores)
## PA1 PA2 PA3 PA4
## [1,] -0.1338871 0.9175166 -1.719604873 0.09135411
## [2,] 1.6297604 -2.0090053 -0.596361722 0.65808192
## [3,] 0.3637658 0.8361736 0.002979966 1.37548765
## [4,] -1.2225230 -0.5491336 1.245473305 -0.64421384
## [5,] -0.4854209 -0.4276223 -0.026980304 0.47360747
## [6,] -0.5950924 -1.3035333 -1.183019401 -0.95913571
Factors can be relabeled based on your knowledge of the data.
regdata <- cbind(data_X[12], fa_var$scores)
#Labeling the data
names(regdata) <- c("Satisfaction", "Purchase", "Marketing",
                    "Post_purchase", "Prod_positioning")
head(regdata)
## Satisfaction Purchase Marketing Post_purchase Prod_positioning
## 1 8.2 -0.1338871 0.9175166 -1.719604873 0.09135411
## 2 5.7 1.6297604 -2.0090053 -0.596361722 0.65808192
## 3 8.9 0.3637658 0.8361736 0.002979966 1.37548765
## 4 4.8 -1.2225230 -0.5491336 1.245473305 -0.64421384
## 5 7.1 -0.4854209 -0.4276223 -0.026980304 0.47360747
## 6 4.7 -0.5950924 -1.3035333 -1.183019401 -0.95913571
set.seed(100)
indices <- sample(1:nrow(regdata), 0.7 * nrow(regdata))
train <- regdata[indices, ]
test <- regdata[-indices, ]
model1 <- lm(Satisfaction ~ ., data = train)
summary(model1)
##
## Call:
## lm(formula = Satisfaction ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.76280 -0.48717 0.06799 0.46459 1.24022
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.91852 0.08068 85.757 < 2e-16 ***
## Purchase 0.50230 0.07641 6.574 9.74e-09 ***
## Marketing 0.75488 0.08390 8.998 5.00e-13 ***
## Post_purchase 0.08755 0.08216 1.066 0.291
## Prod_positioning 0.58074 0.08781 6.614 8.30e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6629 on 65 degrees of freedom
## Multiple R-squared: 0.7261, Adjusted R-squared: 0.7093
## F-statistic: 43.08 on 4 and 65 DF, p-value: < 2.2e-16
print(vif(model1))
## Purchase Marketing Post_purchase Prod_positioning
## 1.009648 1.008235 1.015126 1.024050
#Model 1:
pred_test1 <- predict(model1, newdata = test, type = "response")
pred_test1
## 6 8 11 13 17 19 26 33
## 4.975008 5.908267 6.951629 8.677431 6.613838 6.963113 6.313513 6.141338
## 34 35 37 40 42 44 49 50
## 6.158993 7.415742 6.589746 6.858206 7.133989 8.533080 8.765145 8.078744
## 53 56 57 60 65 67 71 73
## 7.395438 7.468360 8.744402 6.276660 5.936570 6.650322 8.299545 7.685564
## 75 80 96 97 99 100
## 7.330191 6.719528 7.540233 6.143172 8.084583 5.799897
test$Satisfaction_Predicted <- pred_test1
head(test[c(1,6)], 10)
## Satisfaction Satisfaction_Predicted
## 6 4.7 4.975008
## 8 6.3 5.908267
## 11 7.4 6.951629
## 13 8.4 8.677431
## 17 6.4 6.613838
## 19 6.8 6.963113
## 26 6.6 6.313513
## 33 5.4 6.141338
## 34 7.3 6.158993
## 35 6.3 7.415742
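Since the Metrics package is already loaded, the held-out prediction error can be summarized directly; a minimal sketch:
rmse(test$Satisfaction, pred_test1)  # root mean squared error on the test set
mae(test$Satisfaction, pred_test1)   # mean absolute error on the test set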