## Factor analysis example 1: hair #####################################################
## Importing packages
library(corrplot)       # used for making correlation plot.
## corrplot 0.84 loaded
library(tidyverse)      # metapackage with lots of helpful functions
## -- Attaching packages ------------------------------------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts --------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)        # used for plotting
library(psych)          # used for Factor Analysis
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(caTools)        # used for partitioning the data

## Reading in files
hair <- read.csv("C:/Users/ehk994/Desktop/Teaching/Research for Marketing Communications/9 - Factor analysis/Data_Factor_Hair_Revised.csv" , header = T)
## Name variables & explore data ############
## Saving variable names in a vector
variables <- c("Product Quality" , "E-Commerce" , "Technical Support" , "Complaint Resolution" , 
               "Advertising" , "Product Line" , "Salesforce Image", "Competitive Pricing" , 
               "Warranty & Claims" , "Order & Billing" , "Delivery Speed" , "Customer Satisfaction")
## Check the dimensions of the data
dim(hair)
## [1] 100  13
## find out names of the columns
names(hair)
##  [1] "ID"           "ProdQual"     "Ecom"         "TechSup"     
##  [5] "CompRes"      "Advertising"  "ProdLine"     "SalesFImage" 
##  [9] "ComPricing"   "WartyClaim"   "OrdBilling"   "DelSpeed"    
## [13] "Satisfaction"
## Check the structure of the data
str(hair)
## 'data.frame':    100 obs. of  13 variables:
##  $ ID          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ ProdQual    : num  8.5 8.2 9.2 6.4 9 6.5 6.9 6.2 5.8 6.4 ...
##  $ Ecom        : num  3.9 2.7 3.4 3.3 3.4 2.8 3.7 3.3 3.6 4.5 ...
##  $ TechSup     : num  2.5 5.1 5.6 7 5.2 3.1 5 3.9 5.1 5.1 ...
##  $ CompRes     : num  5.9 7.2 5.6 3.7 4.6 4.1 2.6 4.8 6.7 6.1 ...
##  $ Advertising : num  4.8 3.4 5.4 4.7 2.2 4 2.1 4.6 3.7 4.7 ...
##  $ ProdLine    : num  4.9 7.9 7.4 4.7 6 4.3 2.3 3.6 5.9 5.7 ...
##  $ SalesFImage : num  6 3.1 5.8 4.5 4.5 3.7 5.4 5.1 5.8 5.7 ...
##  $ ComPricing  : num  6.8 5.3 4.5 8.8 6.8 8.5 8.9 6.9 9.3 8.4 ...
##  $ WartyClaim  : num  4.7 5.5 6.2 7 6.1 5.1 4.8 5.4 5.9 5.4 ...
##  $ OrdBilling  : num  5 3.9 5.4 4.3 4.5 3.6 2.1 4.3 4.4 4.1 ...
##  $ DelSpeed    : num  3.7 4.9 4.5 3 3.5 3.3 2 3.7 4.6 4.4 ...
##  $ Satisfaction: num  8.2 5.7 8.9 4.8 7.1 4.7 5.7 6.3 7 5.5 ...
## Check summary of the data 
summary(hair)
##        ID            ProdQual           Ecom          TechSup     
##  Min.   :  1.00   Min.   : 5.000   Min.   :2.200   Min.   :1.300  
##  1st Qu.: 25.75   1st Qu.: 6.575   1st Qu.:3.275   1st Qu.:4.250  
##  Median : 50.50   Median : 8.000   Median :3.600   Median :5.400  
##  Mean   : 50.50   Mean   : 7.810   Mean   :3.672   Mean   :5.365  
##  3rd Qu.: 75.25   3rd Qu.: 9.100   3rd Qu.:3.925   3rd Qu.:6.625  
##  Max.   :100.00   Max.   :10.000   Max.   :5.700   Max.   :8.500  
##     CompRes       Advertising       ProdLine      SalesFImage   
##  Min.   :2.600   Min.   :1.900   Min.   :2.300   Min.   :2.900  
##  1st Qu.:4.600   1st Qu.:3.175   1st Qu.:4.700   1st Qu.:4.500  
##  Median :5.450   Median :4.000   Median :5.750   Median :4.900  
##  Mean   :5.442   Mean   :4.010   Mean   :5.805   Mean   :5.123  
##  3rd Qu.:6.325   3rd Qu.:4.800   3rd Qu.:6.800   3rd Qu.:5.800  
##  Max.   :7.800   Max.   :6.500   Max.   :8.400   Max.   :8.200  
##    ComPricing      WartyClaim      OrdBilling       DelSpeed    
##  Min.   :3.700   Min.   :4.100   Min.   :2.000   Min.   :1.600  
##  1st Qu.:5.875   1st Qu.:5.400   1st Qu.:3.700   1st Qu.:3.400  
##  Median :7.100   Median :6.100   Median :4.400   Median :3.900  
##  Mean   :6.974   Mean   :6.043   Mean   :4.278   Mean   :3.886  
##  3rd Qu.:8.400   3rd Qu.:6.600   3rd Qu.:4.800   3rd Qu.:4.425  
##  Max.   :9.900   Max.   :8.100   Max.   :6.700   Max.   :5.500  
##   Satisfaction  
##  Min.   :4.700  
##  1st Qu.:6.000  
##  Median :7.050  
##  Mean   :6.918  
##  3rd Qu.:7.625  
##  Max.   :9.900
# From summary and structure we learned that the given data is scaled already and no need to scale it again.
# We also learned that first column named "ID" is just a column number 
# and we won't be needing in the for process.
# Create new data frame named hair1 with all data except the column "ID"
hair1 <- hair[,-1]
#scaled_hair <- scale(hair1) #(x-mean(x))/sd(x)
## Change names of variables 
colnames(hair1) <- variables
## Attach Data 
attach(hair1)
## Missing Values in Data
sum(is.na(hair1))
## [1] 0
## Some descriptive statistics ##################################
## Histogram of the Target Variable that is Customer Satisfaction
hist(`Customer Satisfaction`, breaks = c(0:11), labels = T,
      include.lowest=T, right=T, 
      col=8, border=1, 
      main = paste("Histogram of Customer Satisfaction"),
      xlab= "Customer Satisfaction", ylab="COUNT", 
      xlim = c(0,11), ylim = c(0,35))

## BoxPlot of the Target Variable that is Customer Satisfaction
boxplot(`Customer Satisfaction`, horizontal = T, xlab = variables[12], ylim=c(0,11))

## Histogram of the independent Variables
par(mfrow = c(3,4)) #Convert Plotting space in 12
for (i in (1:11)) {
  
  h = round(max(hair1[,i]),0)+1
  
  l = round(min(hair1[,i]),0)-1
  
  n = variables[i]
  
  hist (hair1[,i], breaks = seq(l,h,((h-l)/6)), labels = T,
        include.lowest=T, right=T, 
        col=8, border=1, 
        main = NULL, xlab= n, ylab=NULL, 
        cex.lab=1, cex.axis=1, cex.main=1, cex.sub=1,
        xlim = c(0,11), ylim = c(0,70))
}

## Explore data "hair1" using boxplot methods 
par(mfrow = c(2,1))
boxplot(hair1[,-12], las = 2, names = variables[-12], cex.axis = 1)

## Create correlation matrix --> Input for a factor analysis
corlnMtrx <- cor(hair1[,-12]) # exclude y variable
corlnMtrx
##                      Product Quality    E-Commerce Technical Support
## Product Quality           1.00000000 -0.1371632174      0.0956004542
## E-Commerce               -0.13716322  1.0000000000      0.0008667887
## Technical Support         0.09560045  0.0008667887      1.0000000000
## Complaint Resolution      0.10637000  0.1401792611      0.0966565978
## Advertising              -0.05347313  0.4298907110     -0.0628700668
## Product Line              0.47749341 -0.0526878383      0.1926254565
## Salesforce Image         -0.15181287  0.7915437115      0.0169905395
## Competitive Pricing      -0.40128188  0.2294624014     -0.2707866821
## Warranty & Claims         0.08831231  0.0518981915      0.7971679258
## Order & Billing           0.10430307  0.1561473316      0.0801018246
## Delivery Speed            0.02771800  0.1916360683      0.0254406935
##                      Complaint Resolution Advertising Product Line
## Product Quality                 0.1063700 -0.05347313   0.47749341
## E-Commerce                      0.1401793  0.42989071  -0.05268784
## Technical Support               0.0966566 -0.06287007   0.19262546
## Complaint Resolution            1.0000000  0.19691685   0.56141695
## Advertising                     0.1969168  1.00000000  -0.01155082
## Product Line                    0.5614170 -0.01155082   1.00000000
## Salesforce Image                0.2297518  0.54220366  -0.06131553
## Competitive Pricing            -0.1279543  0.13421689  -0.49494840
## Warranty & Claims               0.1404083  0.01079207   0.27307753
## Order & Billing                 0.7568686  0.18423559   0.42440825
## Delivery Speed                  0.8650917  0.27586308   0.60185021
##                      Salesforce Image Competitive Pricing
## Product Quality           -0.15181287         -0.40128188
## E-Commerce                 0.79154371          0.22946240
## Technical Support          0.01699054         -0.27078668
## Complaint Resolution       0.22975176         -0.12795425
## Advertising                0.54220366          0.13421689
## Product Line              -0.06131553         -0.49494840
## Salesforce Image           1.00000000          0.26459655
## Competitive Pricing        0.26459655          1.00000000
## Warranty & Claims          0.10745534         -0.24498605
## Order & Billing            0.19512741         -0.11456703
## Delivery Speed             0.27155126         -0.07287173
##                      Warranty & Claims Order & Billing Delivery Speed
## Product Quality             0.08831231      0.10430307     0.02771800
## E-Commerce                  0.05189819      0.15614733     0.19163607
## Technical Support           0.79716793      0.08010182     0.02544069
## Complaint Resolution        0.14040830      0.75686859     0.86509170
## Advertising                 0.01079207      0.18423559     0.27586308
## Product Line                0.27307753      0.42440825     0.60185021
## Salesforce Image            0.10745534      0.19512741     0.27155126
## Competitive Pricing        -0.24498605     -0.11456703    -0.07287173
## Warranty & Claims           1.00000000      0.19706512     0.10939460
## Order & Billing             0.19706512      1.00000000     0.75100307
## Delivery Speed              0.10939460      0.75100307     1.00000000
### Factor analysis steps #########################
## 1) Initial extraction
# Initial run without rotation
#library(psych)
fit1 <- factanal(hair1[,-12], 6, rotation="none" )
print(fit1) # we can check eigenvalues with this intial run
## 
## Call:
## factanal(x = hair1[, -12], factors = 6, rotation = "none")
## 
## Uniquenesses:
##      Product Quality           E-Commerce    Technical Support 
##                0.440                0.364                0.005 
## Complaint Resolution          Advertising         Product Line 
##                0.203                0.666                0.005 
##     Salesforce Image  Competitive Pricing    Warranty & Claims 
##                0.005                0.596                0.304 
##      Order & Billing       Delivery Speed 
##                0.005                0.015 
## 
## Loadings:
##                      Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## Product Quality       0.264  -0.299  -0.118   0.260  -0.369   0.428 
## E-Commerce            0.209   0.660   0.336   0.188                 
## Technical Support     0.316  -0.477   0.809  -0.112                 
## Complaint Resolution  0.816   0.123  -0.147           0.274         
## Advertising           0.204   0.493   0.144                   0.104 
## Product Line          0.778  -0.357  -0.211   0.463                 
## Salesforce Image      0.272   0.822   0.433   0.238                 
## Competitive Pricing  -0.298   0.464          -0.170   0.223  -0.143 
## Warranty & Claims     0.400  -0.323   0.633                  -0.137 
## Order & Billing       0.844   0.183  -0.184  -0.455                 
## Delivery Speed        0.845   0.180  -0.205           0.443         
## 
##                Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## SS loadings      3.275   2.199   1.533   0.644   0.493   0.247
## Proportion Var   0.298   0.200   0.139   0.059   0.045   0.022
## Cumulative Var   0.298   0.498   0.637   0.696   0.740   0.763
## 
## Test of the hypothesis that 6 factors are sufficient.
## The chi square statistic is 4.04 on 4 degrees of freedom.
## The p-value is 0.401
# Initial run with rotation
#library(psych)
fit2 <- factanal(hair1[,-12], 6, rotation="varimax" )
print(fit2) # we can check eigenvalues with this intial run
## 
## Call:
## factanal(x = hair1[, -12], factors = 6, rotation = "varimax")
## 
## Uniquenesses:
##      Product Quality           E-Commerce    Technical Support 
##                0.440                0.364                0.005 
## Complaint Resolution          Advertising         Product Line 
##                0.203                0.666                0.005 
##     Salesforce Image  Competitive Pricing    Warranty & Claims 
##                0.005                0.596                0.304 
##      Order & Billing       Delivery Speed 
##                0.005                0.015 
## 
## Loadings:
##                      Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## Product Quality                               0.739                 
## E-Commerce                    0.782          -0.106                 
## Technical Support                     0.987   0.102                 
## Complaint Resolution  0.865   0.158           0.137                 
## Advertising           0.189   0.530                                 
## Product Line          0.542           0.141   0.678  -0.117   0.453 
## Salesforce Image              0.983          -0.119                 
## Competitive Pricing           0.228  -0.219  -0.535                 
## Warranty & Claims                     0.809   0.116           0.100 
## Order & Billing       0.827   0.111                   0.527         
## Delivery Speed        0.961   0.184                  -0.117   0.104 
## 
##                Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## SS loadings      2.714   1.994   1.718   1.376   0.324   0.266
## Proportion Var   0.247   0.181   0.156   0.125   0.029   0.024
## Cumulative Var   0.247   0.428   0.584   0.709   0.739   0.763
## 
## Test of the hypothesis that 6 factors are sufficient.
## The chi square statistic is 4.04 on 4 degrees of freedom.
## The p-value is 0.401
# Bartlett's test of sphericity
cortest.bartlett(corlnMtrx, 100)
## $chisq
## [1] 619.2726
## 
## $p.value
## [1] 1.79337e-96
## 
## $df
## [1] 55
# Bartlett’s test of sphericity is a test statistic used to examine the null hypothesis that the variables are uncorrelated in the population.
# Small values (less than 0.05) of the p-value indicate that a factor analysis may be useful with your data. 
## 2) Determine number of factors to retain
# (i) scree plot
# Eigenvalues
A <- eigen(corlnMtrx)
EV <- A$values

# Use EV as an input to create a scree plot and add lines.
plot(EV, main = "Scree Plot", xlab = "Factors", ylab = "Eigen Values", pch = 20, col = "blue")
lines(EV, col = "red")
abline(h = 1, col = "green", lty = 2) # look at the bending point; works as a cut-off

# (ii) Kaiser Rule: Keep all factors with EV > 1
# (iii) Proportion of varianve: factors to explain 50% + of variance --> Look at 'Cumulative Var' in the output of the factor analysis
## 3) Varimax rotation
#library(psych)
fit3 <- factanal(hair1[,-12], 4, rotation="varimax")
print(fit3)
## 
## Call:
## factanal(x = hair1[, -12], factors = 4, rotation = "varimax")
## 
## Uniquenesses:
##      Product Quality           E-Commerce    Technical Support 
##                0.682                0.360                0.228 
## Complaint Resolution          Advertising         Product Line 
##                0.178                0.679                0.005 
##     Salesforce Image  Competitive Pricing    Warranty & Claims 
##                0.017                0.636                0.163 
##      Order & Billing       Delivery Speed 
##                0.347                0.076 
## 
## Loadings:
##                      Factor1 Factor2 Factor3 Factor4
## Product Quality                               0.557 
## E-Commerce                    0.793                 
## Technical Support                     0.872   0.102 
## Complaint Resolution  0.884   0.142           0.135 
## Advertising           0.190   0.521          -0.110 
## Product Line          0.502           0.104   0.856 
## Salesforce Image      0.119   0.974          -0.130 
## Competitive Pricing           0.225  -0.216  -0.514 
## Warranty & Claims                     0.894   0.158 
## Order & Billing       0.794   0.101   0.105         
## Delivery Speed        0.928   0.189           0.164 
## 
##                Factor1 Factor2 Factor3 Factor4
## SS loadings      2.592   1.977   1.638   1.423
## Proportion Var   0.236   0.180   0.149   0.129
## Cumulative Var   0.236   0.415   0.564   0.694
## 
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 24.26 on 17 degrees of freedom.
## The p-value is 0.113
## 4) Interpret solution: Name main factors --> Functions/ Images / Technique and warranty / Product line

## 5) Calculate factor scores with a rotated method
fit4 <- factanal(hair1[,-12], 4, rotation="varimax", scores = "regression")
print(fit4)
## 
## Call:
## factanal(x = hair1[, -12], factors = 4, scores = "regression",     rotation = "varimax")
## 
## Uniquenesses:
##      Product Quality           E-Commerce    Technical Support 
##                0.682                0.360                0.228 
## Complaint Resolution          Advertising         Product Line 
##                0.178                0.679                0.005 
##     Salesforce Image  Competitive Pricing    Warranty & Claims 
##                0.017                0.636                0.163 
##      Order & Billing       Delivery Speed 
##                0.347                0.076 
## 
## Loadings:
##                      Factor1 Factor2 Factor3 Factor4
## Product Quality                               0.557 
## E-Commerce                    0.793                 
## Technical Support                     0.872   0.102 
## Complaint Resolution  0.884   0.142           0.135 
## Advertising           0.190   0.521          -0.110 
## Product Line          0.502           0.104   0.856 
## Salesforce Image      0.119   0.974          -0.130 
## Competitive Pricing           0.225  -0.216  -0.514 
## Warranty & Claims                     0.894   0.158 
## Order & Billing       0.794   0.101   0.105         
## Delivery Speed        0.928   0.189           0.164 
## 
##                Factor1 Factor2 Factor3 Factor4
## SS loadings      2.592   1.977   1.638   1.423
## Proportion Var   0.236   0.180   0.149   0.129
## Cumulative Var   0.236   0.415   0.564   0.694
## 
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 24.26 on 17 degrees of freedom.
## The p-value is 0.113
# Save factor scores for further analysis
factor_scores <- as.data.frame(fit4$scores)
## Run a regression model with factors ##########################
## Create a data set
hair_factor <- cbind(hair, factor_scores) #factor scores work as new X variables instead of original X variables

hair_regression <- lm(Satisfaction ~ Factor1 + Factor2 + Factor3 + Factor4, data = hair_factor)

summary(hair_regression)
## 
## Call:
## lm(formula = Satisfaction ~ Factor1 + Factor2 + Factor3 + Factor4, 
##     data = hair_factor)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.00581 -0.57872  0.04176  0.54492  1.72415 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.91800    0.07632  90.645  < 2e-16 ***
## Factor1      0.57483    0.07952   7.229 1.22e-10 ***
## Factor2      0.58816    0.07758   7.581 2.27e-11 ***
## Factor3      0.08588    0.08136   1.056    0.294    
## Factor4      0.43295    0.07812   5.542 2.66e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7632 on 95 degrees of freedom
## Multiple R-squared:  0.6065, Adjusted R-squared:   0.59 
## F-statistic: 36.61 on 4 and 95 DF,  p-value: < 2.2e-16