## Factor analysis example 1: hair #####################################################
## Importing packages
library(corrplot) # used for making correlation plot.
## corrplot 0.84 loaded
library(tidyverse) # metapackage with lots of helpful functions
## -- Attaching packages ------------------------------------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts --------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2) # used for plotting
library(psych) # used for Factor Analysis
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(caTools) # used for partitioning the data
## Reading in files
hair <- read.csv("C:/Users/ehk994/Desktop/Teaching/Research for Marketing Communications/9 - Factor analysis/Data_Factor_Hair_Revised.csv" , header = T)
## Name variables & explore data ############
## Saving variable names in a vector
variables <- c("Product Quality" , "E-Commerce" , "Technical Support" , "Complaint Resolution" ,
"Advertising" , "Product Line" , "Salesforce Image", "Competitive Pricing" ,
"Warranty & Claims" , "Order & Billing" , "Delivery Speed" , "Customer Satisfaction")
## Check the dimensions of the data
dim(hair)
## [1] 100 13
## find out names of the columns
names(hair)
## [1] "ID" "ProdQual" "Ecom" "TechSup"
## [5] "CompRes" "Advertising" "ProdLine" "SalesFImage"
## [9] "ComPricing" "WartyClaim" "OrdBilling" "DelSpeed"
## [13] "Satisfaction"
## Check the structure of the data
str(hair)
## 'data.frame': 100 obs. of 13 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ ProdQual : num 8.5 8.2 9.2 6.4 9 6.5 6.9 6.2 5.8 6.4 ...
## $ Ecom : num 3.9 2.7 3.4 3.3 3.4 2.8 3.7 3.3 3.6 4.5 ...
## $ TechSup : num 2.5 5.1 5.6 7 5.2 3.1 5 3.9 5.1 5.1 ...
## $ CompRes : num 5.9 7.2 5.6 3.7 4.6 4.1 2.6 4.8 6.7 6.1 ...
## $ Advertising : num 4.8 3.4 5.4 4.7 2.2 4 2.1 4.6 3.7 4.7 ...
## $ ProdLine : num 4.9 7.9 7.4 4.7 6 4.3 2.3 3.6 5.9 5.7 ...
## $ SalesFImage : num 6 3.1 5.8 4.5 4.5 3.7 5.4 5.1 5.8 5.7 ...
## $ ComPricing : num 6.8 5.3 4.5 8.8 6.8 8.5 8.9 6.9 9.3 8.4 ...
## $ WartyClaim : num 4.7 5.5 6.2 7 6.1 5.1 4.8 5.4 5.9 5.4 ...
## $ OrdBilling : num 5 3.9 5.4 4.3 4.5 3.6 2.1 4.3 4.4 4.1 ...
## $ DelSpeed : num 3.7 4.9 4.5 3 3.5 3.3 2 3.7 4.6 4.4 ...
## $ Satisfaction: num 8.2 5.7 8.9 4.8 7.1 4.7 5.7 6.3 7 5.5 ...
## Check summary of the data
summary(hair)
## ID ProdQual Ecom TechSup
## Min. : 1.00 Min. : 5.000 Min. :2.200 Min. :1.300
## 1st Qu.: 25.75 1st Qu.: 6.575 1st Qu.:3.275 1st Qu.:4.250
## Median : 50.50 Median : 8.000 Median :3.600 Median :5.400
## Mean : 50.50 Mean : 7.810 Mean :3.672 Mean :5.365
## 3rd Qu.: 75.25 3rd Qu.: 9.100 3rd Qu.:3.925 3rd Qu.:6.625
## Max. :100.00 Max. :10.000 Max. :5.700 Max. :8.500
## CompRes Advertising ProdLine SalesFImage
## Min. :2.600 Min. :1.900 Min. :2.300 Min. :2.900
## 1st Qu.:4.600 1st Qu.:3.175 1st Qu.:4.700 1st Qu.:4.500
## Median :5.450 Median :4.000 Median :5.750 Median :4.900
## Mean :5.442 Mean :4.010 Mean :5.805 Mean :5.123
## 3rd Qu.:6.325 3rd Qu.:4.800 3rd Qu.:6.800 3rd Qu.:5.800
## Max. :7.800 Max. :6.500 Max. :8.400 Max. :8.200
## ComPricing WartyClaim OrdBilling DelSpeed
## Min. :3.700 Min. :4.100 Min. :2.000 Min. :1.600
## 1st Qu.:5.875 1st Qu.:5.400 1st Qu.:3.700 1st Qu.:3.400
## Median :7.100 Median :6.100 Median :4.400 Median :3.900
## Mean :6.974 Mean :6.043 Mean :4.278 Mean :3.886
## 3rd Qu.:8.400 3rd Qu.:6.600 3rd Qu.:4.800 3rd Qu.:4.425
## Max. :9.900 Max. :8.100 Max. :6.700 Max. :5.500
## Satisfaction
## Min. :4.700
## 1st Qu.:6.000
## Median :7.050
## Mean :6.918
## 3rd Qu.:7.625
## Max. :9.900
# From summary and structure we learned that the given data is scaled already and no need to scale it again.
# We also learned that first column named "ID" is just a column number
# and we won't be needing in the for process.
# Create new data frame named hair1 with all data except the column "ID"
hair1 <- hair[,-1]
#scaled_hair <- scale(hair1) #(x-mean(x))/sd(x)
## Change names of variables
colnames(hair1) <- variables
## Attach Data
attach(hair1)
## Missing Values in Data
sum(is.na(hair1))
## [1] 0
## Some descriptive statistics ##################################
## Histogram of the Target Variable that is Customer Satisfaction
hist(`Customer Satisfaction`, breaks = c(0:11), labels = T,
include.lowest=T, right=T,
col=8, border=1,
main = paste("Histogram of Customer Satisfaction"),
xlab= "Customer Satisfaction", ylab="COUNT",
xlim = c(0,11), ylim = c(0,35))

## BoxPlot of the Target Variable that is Customer Satisfaction
boxplot(`Customer Satisfaction`, horizontal = T, xlab = variables[12], ylim=c(0,11))

## Histogram of the independent Variables
par(mfrow = c(3,4)) #Convert Plotting space in 12
for (i in (1:11)) {
h = round(max(hair1[,i]),0)+1
l = round(min(hair1[,i]),0)-1
n = variables[i]
hist (hair1[,i], breaks = seq(l,h,((h-l)/6)), labels = T,
include.lowest=T, right=T,
col=8, border=1,
main = NULL, xlab= n, ylab=NULL,
cex.lab=1, cex.axis=1, cex.main=1, cex.sub=1,
xlim = c(0,11), ylim = c(0,70))
}

## Explore data "hair1" using boxplot methods
par(mfrow = c(2,1))
boxplot(hair1[,-12], las = 2, names = variables[-12], cex.axis = 1)

## Create correlation matrix --> Input for a factor analysis
corlnMtrx <- cor(hair1[,-12]) # exclude y variable
corlnMtrx
## Product Quality E-Commerce Technical Support
## Product Quality 1.00000000 -0.1371632174 0.0956004542
## E-Commerce -0.13716322 1.0000000000 0.0008667887
## Technical Support 0.09560045 0.0008667887 1.0000000000
## Complaint Resolution 0.10637000 0.1401792611 0.0966565978
## Advertising -0.05347313 0.4298907110 -0.0628700668
## Product Line 0.47749341 -0.0526878383 0.1926254565
## Salesforce Image -0.15181287 0.7915437115 0.0169905395
## Competitive Pricing -0.40128188 0.2294624014 -0.2707866821
## Warranty & Claims 0.08831231 0.0518981915 0.7971679258
## Order & Billing 0.10430307 0.1561473316 0.0801018246
## Delivery Speed 0.02771800 0.1916360683 0.0254406935
## Complaint Resolution Advertising Product Line
## Product Quality 0.1063700 -0.05347313 0.47749341
## E-Commerce 0.1401793 0.42989071 -0.05268784
## Technical Support 0.0966566 -0.06287007 0.19262546
## Complaint Resolution 1.0000000 0.19691685 0.56141695
## Advertising 0.1969168 1.00000000 -0.01155082
## Product Line 0.5614170 -0.01155082 1.00000000
## Salesforce Image 0.2297518 0.54220366 -0.06131553
## Competitive Pricing -0.1279543 0.13421689 -0.49494840
## Warranty & Claims 0.1404083 0.01079207 0.27307753
## Order & Billing 0.7568686 0.18423559 0.42440825
## Delivery Speed 0.8650917 0.27586308 0.60185021
## Salesforce Image Competitive Pricing
## Product Quality -0.15181287 -0.40128188
## E-Commerce 0.79154371 0.22946240
## Technical Support 0.01699054 -0.27078668
## Complaint Resolution 0.22975176 -0.12795425
## Advertising 0.54220366 0.13421689
## Product Line -0.06131553 -0.49494840
## Salesforce Image 1.00000000 0.26459655
## Competitive Pricing 0.26459655 1.00000000
## Warranty & Claims 0.10745534 -0.24498605
## Order & Billing 0.19512741 -0.11456703
## Delivery Speed 0.27155126 -0.07287173
## Warranty & Claims Order & Billing Delivery Speed
## Product Quality 0.08831231 0.10430307 0.02771800
## E-Commerce 0.05189819 0.15614733 0.19163607
## Technical Support 0.79716793 0.08010182 0.02544069
## Complaint Resolution 0.14040830 0.75686859 0.86509170
## Advertising 0.01079207 0.18423559 0.27586308
## Product Line 0.27307753 0.42440825 0.60185021
## Salesforce Image 0.10745534 0.19512741 0.27155126
## Competitive Pricing -0.24498605 -0.11456703 -0.07287173
## Warranty & Claims 1.00000000 0.19706512 0.10939460
## Order & Billing 0.19706512 1.00000000 0.75100307
## Delivery Speed 0.10939460 0.75100307 1.00000000
### Factor analysis steps #########################
## 1) Initial extraction
# Initial run without rotation
#library(psych)
fit1 <- factanal(hair1[,-12], 6, rotation="none" )
print(fit1) # we can check eigenvalues with this intial run
##
## Call:
## factanal(x = hair1[, -12], factors = 6, rotation = "none")
##
## Uniquenesses:
## Product Quality E-Commerce Technical Support
## 0.440 0.364 0.005
## Complaint Resolution Advertising Product Line
## 0.203 0.666 0.005
## Salesforce Image Competitive Pricing Warranty & Claims
## 0.005 0.596 0.304
## Order & Billing Delivery Speed
## 0.005 0.015
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## Product Quality 0.264 -0.299 -0.118 0.260 -0.369 0.428
## E-Commerce 0.209 0.660 0.336 0.188
## Technical Support 0.316 -0.477 0.809 -0.112
## Complaint Resolution 0.816 0.123 -0.147 0.274
## Advertising 0.204 0.493 0.144 0.104
## Product Line 0.778 -0.357 -0.211 0.463
## Salesforce Image 0.272 0.822 0.433 0.238
## Competitive Pricing -0.298 0.464 -0.170 0.223 -0.143
## Warranty & Claims 0.400 -0.323 0.633 -0.137
## Order & Billing 0.844 0.183 -0.184 -0.455
## Delivery Speed 0.845 0.180 -0.205 0.443
##
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## SS loadings 3.275 2.199 1.533 0.644 0.493 0.247
## Proportion Var 0.298 0.200 0.139 0.059 0.045 0.022
## Cumulative Var 0.298 0.498 0.637 0.696 0.740 0.763
##
## Test of the hypothesis that 6 factors are sufficient.
## The chi square statistic is 4.04 on 4 degrees of freedom.
## The p-value is 0.401
# Initial run with rotation
#library(psych)
fit2 <- factanal(hair1[,-12], 6, rotation="varimax" )
print(fit2) # we can check eigenvalues with this intial run
##
## Call:
## factanal(x = hair1[, -12], factors = 6, rotation = "varimax")
##
## Uniquenesses:
## Product Quality E-Commerce Technical Support
## 0.440 0.364 0.005
## Complaint Resolution Advertising Product Line
## 0.203 0.666 0.005
## Salesforce Image Competitive Pricing Warranty & Claims
## 0.005 0.596 0.304
## Order & Billing Delivery Speed
## 0.005 0.015
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## Product Quality 0.739
## E-Commerce 0.782 -0.106
## Technical Support 0.987 0.102
## Complaint Resolution 0.865 0.158 0.137
## Advertising 0.189 0.530
## Product Line 0.542 0.141 0.678 -0.117 0.453
## Salesforce Image 0.983 -0.119
## Competitive Pricing 0.228 -0.219 -0.535
## Warranty & Claims 0.809 0.116 0.100
## Order & Billing 0.827 0.111 0.527
## Delivery Speed 0.961 0.184 -0.117 0.104
##
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## SS loadings 2.714 1.994 1.718 1.376 0.324 0.266
## Proportion Var 0.247 0.181 0.156 0.125 0.029 0.024
## Cumulative Var 0.247 0.428 0.584 0.709 0.739 0.763
##
## Test of the hypothesis that 6 factors are sufficient.
## The chi square statistic is 4.04 on 4 degrees of freedom.
## The p-value is 0.401
# Bartlett's test of sphericity
cortest.bartlett(corlnMtrx, 100)
## $chisq
## [1] 619.2726
##
## $p.value
## [1] 1.79337e-96
##
## $df
## [1] 55
# Bartlett’s test of sphericity is a test statistic used to examine the null hypothesis that the variables are uncorrelated in the population.
# Small values (less than 0.05) of the p-value indicate that a factor analysis may be useful with your data.
## 2) Determine number of factors to retain
# (i) scree plot
# Eigenvalues
A <- eigen(corlnMtrx)
EV <- A$values
# Use EV as an input to create a scree plot and add lines.
plot(EV, main = "Scree Plot", xlab = "Factors", ylab = "Eigen Values", pch = 20, col = "blue")
lines(EV, col = "red")
abline(h = 1, col = "green", lty = 2) # look at the bending point; works as a cut-off

# (ii) Kaiser Rule: Keep all factors with EV > 1
# (iii) Proportion of varianve: factors to explain 50% + of variance --> Look at 'Cumulative Var' in the output of the factor analysis
## 3) Varimax rotation
#library(psych)
fit3 <- factanal(hair1[,-12], 4, rotation="varimax")
print(fit3)
##
## Call:
## factanal(x = hair1[, -12], factors = 4, rotation = "varimax")
##
## Uniquenesses:
## Product Quality E-Commerce Technical Support
## 0.682 0.360 0.228
## Complaint Resolution Advertising Product Line
## 0.178 0.679 0.005
## Salesforce Image Competitive Pricing Warranty & Claims
## 0.017 0.636 0.163
## Order & Billing Delivery Speed
## 0.347 0.076
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4
## Product Quality 0.557
## E-Commerce 0.793
## Technical Support 0.872 0.102
## Complaint Resolution 0.884 0.142 0.135
## Advertising 0.190 0.521 -0.110
## Product Line 0.502 0.104 0.856
## Salesforce Image 0.119 0.974 -0.130
## Competitive Pricing 0.225 -0.216 -0.514
## Warranty & Claims 0.894 0.158
## Order & Billing 0.794 0.101 0.105
## Delivery Speed 0.928 0.189 0.164
##
## Factor1 Factor2 Factor3 Factor4
## SS loadings 2.592 1.977 1.638 1.423
## Proportion Var 0.236 0.180 0.149 0.129
## Cumulative Var 0.236 0.415 0.564 0.694
##
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 24.26 on 17 degrees of freedom.
## The p-value is 0.113
## 4) Interpret solution: Name main factors --> Functions/ Images / Technique and warranty / Product line
## 5) Calculate factor scores with a rotated method
fit4 <- factanal(hair1[,-12], 4, rotation="varimax", scores = "regression")
print(fit4)
##
## Call:
## factanal(x = hair1[, -12], factors = 4, scores = "regression", rotation = "varimax")
##
## Uniquenesses:
## Product Quality E-Commerce Technical Support
## 0.682 0.360 0.228
## Complaint Resolution Advertising Product Line
## 0.178 0.679 0.005
## Salesforce Image Competitive Pricing Warranty & Claims
## 0.017 0.636 0.163
## Order & Billing Delivery Speed
## 0.347 0.076
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4
## Product Quality 0.557
## E-Commerce 0.793
## Technical Support 0.872 0.102
## Complaint Resolution 0.884 0.142 0.135
## Advertising 0.190 0.521 -0.110
## Product Line 0.502 0.104 0.856
## Salesforce Image 0.119 0.974 -0.130
## Competitive Pricing 0.225 -0.216 -0.514
## Warranty & Claims 0.894 0.158
## Order & Billing 0.794 0.101 0.105
## Delivery Speed 0.928 0.189 0.164
##
## Factor1 Factor2 Factor3 Factor4
## SS loadings 2.592 1.977 1.638 1.423
## Proportion Var 0.236 0.180 0.149 0.129
## Cumulative Var 0.236 0.415 0.564 0.694
##
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 24.26 on 17 degrees of freedom.
## The p-value is 0.113
# Save factor scores for further analysis
factor_scores <- as.data.frame(fit4$scores)
## Run a regression model with factors ##########################
## Create a data set
hair_factor <- cbind(hair, factor_scores) #factor scores work as new X variables instead of original X variables
hair_regression <- lm(Satisfaction ~ Factor1 + Factor2 + Factor3 + Factor4, data = hair_factor)
summary(hair_regression)
##
## Call:
## lm(formula = Satisfaction ~ Factor1 + Factor2 + Factor3 + Factor4,
## data = hair_factor)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.00581 -0.57872 0.04176 0.54492 1.72415
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.91800 0.07632 90.645 < 2e-16 ***
## Factor1 0.57483 0.07952 7.229 1.22e-10 ***
## Factor2 0.58816 0.07758 7.581 2.27e-11 ***
## Factor3 0.08588 0.08136 1.056 0.294
## Factor4 0.43295 0.07812 5.542 2.66e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7632 on 95 degrees of freedom
## Multiple R-squared: 0.6065, Adjusted R-squared: 0.59
## F-statistic: 36.61 on 4 and 95 DF, p-value: < 2.2e-16