library(nFactors)
## Loading required package: MASS
## Loading required package: psych
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:psych':
##
## logit
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
##
## Attaching package: 'nFactors'
## The following object is masked from 'package:lattice':
##
## parallel
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(corrplot)
## corrplot 0.84 loaded
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(psych)
library(DataExplorer)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:boot':
##
## logit
## The following object is masked from 'package:psych':
##
## logit
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
options(knitr.table.format = "html")
hair <- read.csv("Factor-Hair-Revised.csv")
dim(hair) ## checking the dimensionality of the dataset
## [1] 100 13
any(is.na(hair)) ## checking for any missing values in dataset
## [1] FALSE
Using the plot_intro function to check the overall structure of the dataset.
### Plotting the Configuration of the Dataset
plot_intro(hair)
Evidently there are no discrete columns and no missing observations in the dataset.
Checking the structure of the dataset:
str(hair)
## 'data.frame': 100 obs. of 13 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ ProdQual : num 8.5 8.2 9.2 6.4 9 6.5 6.9 6.2 5.8 6.4 ...
## $ Ecom : num 3.9 2.7 3.4 3.3 3.4 2.8 3.7 3.3 3.6 4.5 ...
## $ TechSup : num 2.5 5.1 5.6 7 5.2 3.1 5 3.9 5.1 5.1 ...
## $ CompRes : num 5.9 7.2 5.6 3.7 4.6 4.1 2.6 4.8 6.7 6.1 ...
## $ Advertising : num 4.8 3.4 5.4 4.7 2.2 4 2.1 4.6 3.7 4.7 ...
## $ ProdLine : num 4.9 7.9 7.4 4.7 6 4.3 2.3 3.6 5.9 5.7 ...
## $ SalesFImage : num 6 3.1 5.8 4.5 4.5 3.7 5.4 5.1 5.8 5.7 ...
## $ ComPricing : num 6.8 5.3 4.5 8.8 6.8 8.5 8.9 6.9 9.3 8.4 ...
## $ WartyClaim : num 4.7 5.5 6.2 7 6.1 5.1 4.8 5.4 5.9 5.4 ...
## $ OrdBilling : num 5 3.9 5.4 4.3 4.5 3.6 2.1 4.3 4.4 4.1 ...
## $ DelSpeed : num 3.7 4.9 4.5 3 3.5 3.3 2 3.7 4.6 4.4 ...
## $ Satisfaction: num 8.2 5.7 8.9 4.8 7.1 4.7 5.7 6.3 7 5.5 ...
Conducting a five-number summary of the variables in the dataset to detect outliers in plain sight before going deeper.
Visibly there are no outliers in the dataset; the boxplot sketch after the summary output below gives a more direct check.
summary(hair)
## ID ProdQual Ecom TechSup
## Min. : 1.00 Min. : 5.000 Min. :2.200 Min. :1.300
## 1st Qu.: 25.75 1st Qu.: 6.575 1st Qu.:3.275 1st Qu.:4.250
## Median : 50.50 Median : 8.000 Median :3.600 Median :5.400
## Mean : 50.50 Mean : 7.810 Mean :3.672 Mean :5.365
## 3rd Qu.: 75.25 3rd Qu.: 9.100 3rd Qu.:3.925 3rd Qu.:6.625
## Max. :100.00 Max. :10.000 Max. :5.700 Max. :8.500
## CompRes Advertising ProdLine SalesFImage
## Min. :2.600 Min. :1.900 Min. :2.300 Min. :2.900
## 1st Qu.:4.600 1st Qu.:3.175 1st Qu.:4.700 1st Qu.:4.500
## Median :5.450 Median :4.000 Median :5.750 Median :4.900
## Mean :5.442 Mean :4.010 Mean :5.805 Mean :5.123
## 3rd Qu.:6.325 3rd Qu.:4.800 3rd Qu.:6.800 3rd Qu.:5.800
## Max. :7.800 Max. :6.500 Max. :8.400 Max. :8.200
## ComPricing WartyClaim OrdBilling DelSpeed
## Min. :3.700 Min. :4.100 Min. :2.000 Min. :1.600
## 1st Qu.:5.875 1st Qu.:5.400 1st Qu.:3.700 1st Qu.:3.400
## Median :7.100 Median :6.100 Median :4.400 Median :3.900
## Mean :6.974 Mean :6.043 Mean :4.278 Mean :3.886
## 3rd Qu.:8.400 3rd Qu.:6.600 3rd Qu.:4.800 3rd Qu.:4.425
## Max. :9.900 Max. :8.100 Max. :6.700 Max. :5.500
## Satisfaction
## Min. :4.700
## 1st Qu.:6.000
## Median :7.050
## Mean :6.918
## 3rd Qu.:7.625
## Max. :9.900
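As a more direct check on outliers than the summary table, side-by-side boxplots make any extreme points visible. A minimal sketch using base graphics (not part of the original output):
# Boxplots of all variables (ID column dropped); points beyond the
# whiskers would flag potential outliers
boxplot(hair[ , -1], las = 2, cex.axis = 0.7)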
plot_histogram(hair)
plot_density(hair)
The density plots reveal that some variables are left-skewed, like Delivery Speed and, to some extent, Tech Support.
Some are right-skewed, like Sales Force Image.
Some are bimodal, like Product Quality and Warranty Claims.
Most of the rest, such as E-Commerce and Complaint Resolution, resemble a normal distribution.
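These impressions can be quantified with psych's describe function, which reports per-variable skew and kurtosis. A quick numeric check on the same data frame (output omitted; negative skew means left-skewed, positive means right-skewed):
# Shape statistics for every variable except the ID column
describe(hair[ , -1])[ , c("mean", "sd", "skew", "kurtosis")]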
## Extracting the sub-dataset of relevant variables
## The ID column and the dependent variable Satisfaction have been removed
hair.corr = hair[,2:12]
class(hair.corr)
## [1] "data.frame"
cor.h = round(cor(hair.corr),3)
print(cor.h) ## correlation object for visual recognition of actual values amongst predictors
## ProdQual Ecom TechSup CompRes Advertising ProdLine
## ProdQual 1.000 -0.137 0.096 0.106 -0.053 0.477
## Ecom -0.137 1.000 0.001 0.140 0.430 -0.053
## TechSup 0.096 0.001 1.000 0.097 -0.063 0.193
## CompRes 0.106 0.140 0.097 1.000 0.197 0.561
## Advertising -0.053 0.430 -0.063 0.197 1.000 -0.012
## ProdLine 0.477 -0.053 0.193 0.561 -0.012 1.000
## SalesFImage -0.152 0.792 0.017 0.230 0.542 -0.061
## ComPricing -0.401 0.229 -0.271 -0.128 0.134 -0.495
## WartyClaim 0.088 0.052 0.797 0.140 0.011 0.273
## OrdBilling 0.104 0.156 0.080 0.757 0.184 0.424
## DelSpeed 0.028 0.192 0.025 0.865 0.276 0.602
## SalesFImage ComPricing WartyClaim OrdBilling DelSpeed
## ProdQual -0.152 -0.401 0.088 0.104 0.028
## Ecom 0.792 0.229 0.052 0.156 0.192
## TechSup 0.017 -0.271 0.797 0.080 0.025
## CompRes 0.230 -0.128 0.140 0.757 0.865
## Advertising 0.542 0.134 0.011 0.184 0.276
## ProdLine -0.061 -0.495 0.273 0.424 0.602
## SalesFImage 1.000 0.265 0.107 0.195 0.272
## ComPricing 0.265 1.000 -0.245 -0.115 -0.073
## WartyClaim 0.107 -0.245 1.000 0.197 0.109
## OrdBilling 0.195 -0.115 0.197 1.000 0.751
## DelSpeed 0.272 -0.073 0.109 0.751 1.000
# Corrplot method for visual analysis
corrplot(cor.h, method = "shade")
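To pull the strongest pairwise relationships out of the matrix programmatically, the upper triangle can be filtered for large absolute correlations. A small helper sketch (the 0.7 threshold is an illustrative choice, not from the original analysis):
# List variable pairs with |r| > 0.7 (upper triangle only, so each pair appears once)
strong = which(abs(cor.h) > 0.7 & upper.tri(cor.h), arr.ind = TRUE)
data.frame(var1 = rownames(cor.h)[strong[, 1]],
           var2 = colnames(cor.h)[strong[, 2]],
           r = cor.h[strong])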
To check for multicollinearity among the predictors of the hair dataset, the Variance Inflation Factor (VIF) was used.
Any variable with a VIF greater than 4 suggests the presence of multicollinearity among the predictor variables.
# Modelling the response variable against all predictors using linear regression
com1 = lm(Satisfaction ~ . , data = hair)
vif(com1)
## ID ProdQual Ecom TechSup CompRes Advertising
## 1.109457 1.671241 2.776079 2.985817 4.732628 1.525244
## ProdLine SalesFImage ComPricing WartyClaim OrdBilling DelSpeed
## 3.494539 3.457106 1.638619 3.224552 2.941208 6.525324
The VIF for Delivery Speed was found to be 6.525324 (greater than 4).
The VIF for Complaint Resolution (CompRes) was also 4.73, confirming multicollinearity that can destabilise the regression model.
Simple linear regression models for each of the 11 predictors against the response variable Satisfaction were fitted using the lm function (a compact looped alternative is sketched after the individual fits).
lm.ProdQual = lm(Satisfaction ~ ProdQual, hair)
lm.ProdQual
##
## Call:
## lm(formula = Satisfaction ~ ProdQual, data = hair)
##
## Coefficients:
## (Intercept) ProdQual
## 3.6759 0.4151
Satisfaction = 3.6759 + 0.4151 * ProdQual
1. beta-naught, the intercept coefficient, equals 3.6759.
2. beta-one, the slope coefficient for Product Quality, equals 0.4151.
3. For a one-unit change in Product Quality, the Satisfaction rating improves by 0.4151, holding everything else constant, as explained by the model.
lm.Ecom = lm(Satisfaction ~ Ecom, hair)
lm.Ecom
##
## Call:
## lm(formula = Satisfaction ~ Ecom, data = hair)
##
## Coefficients:
## (Intercept) Ecom
## 5.1516 0.4811
Satisfaction = 5.1516 + 0.4811 * Ecom
lm.TechSup = lm(Satisfaction ~ TechSup, hair)
lm.TechSup
##
## Call:
## lm(formula = Satisfaction ~ TechSup, data = hair)
##
## Coefficients:
## (Intercept) TechSup
## 6.44757 0.08768
Satisfaction = 6.44757 + 0.08768 * TechSup
lm.CompRes = lm(Satisfaction ~ CompRes, hair)
lm.CompRes
##
## Call:
## lm(formula = Satisfaction ~ CompRes, data = hair)
##
## Coefficients:
## (Intercept) CompRes
## 3.680 0.595
Satisfaction = 3.680 + 0.595 * CompRes
lm.Advertising = lm(Satisfaction ~ Advertising, hair)
lm.Advertising
##
## Call:
## lm(formula = Satisfaction ~ Advertising, data = hair)
##
## Coefficients:
## (Intercept) Advertising
## 5.6259 0.3222
Satisfaction = 5.6259 + 0.3222 * Advertising
lm.ProdLine = lm(Satisfaction ~ ProdLine, hair)
lm.ProdLine
##
## Call:
## lm(formula = Satisfaction ~ ProdLine, data = hair)
##
## Coefficients:
## (Intercept) ProdLine
## 4.0220 0.4989
Satisfaction = 4.0220 + 0.4989 * ProdLine
lm.SalesFImage = lm(Satisfaction ~ SalesFImage, hair)
lm.SalesFImage
##
## Call:
## lm(formula = Satisfaction ~ SalesFImage, data = hair)
##
## Coefficients:
## (Intercept) SalesFImage
## 4.070 0.556
Satisfaction = 4.070 + 0.556 * SalesFImage
lm.ComPricing = lm(Satisfaction ~ ComPricing, hair)
lm.ComPricing
##
## Call:
## lm(formula = Satisfaction ~ ComPricing, data = hair)
##
## Coefficients:
## (Intercept) ComPricing
## 8.0386 -0.1607
Satisfaction = 8.0386 + (-0.1607) * ComPricing
lm.WartyClaim = lm(Satisfaction ~ WartyClaim, hair)
lm.WartyClaim
##
## Call:
## lm(formula = Satisfaction ~ WartyClaim, data = hair)
##
## Coefficients:
## (Intercept) WartyClaim
## 5.3581 0.2581
Satisfaction = 5.3581 + 0.2581 * WartyClaim
lm.OrdBilling = lm(Satisfaction ~ OrdBilling, hair)
lm.OrdBilling
##
## Call:
## lm(formula = Satisfaction ~ OrdBilling, data = hair)
##
## Coefficients:
## (Intercept) OrdBilling
## 4.0541 0.6695
Satisfaction = 4.0541 + 0.6695 * OrdBilling
lm.DelSpeed = lm(Satisfaction ~ DelSpeed, hair)
lm.DelSpeed
##
## Call:
## lm(formula = Satisfaction ~ DelSpeed, data = hair)
##
## Coefficients:
## (Intercept) DelSpeed
## 3.2791 0.9364
Satisfaction = 3.2791 + 0.9364 * DelSpeed
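As referenced above, the eleven single-predictor fits can also be produced in one pass, collecting all intercepts and slopes in a single table. A compact alternative sketch, equivalent to the individual lm calls above:
# Fit Satisfaction ~ x for every predictor and tabulate the coefficients
predictors = names(hair.corr)  # the 11 predictor columns
coef.table = t(sapply(predictors, function(x)
  coef(lm(reformulate(x, response = "Satisfaction"), data = hair))))
colnames(coef.table) = c("Intercept", "Slope")
round(coef.table, 4)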
Conducting Bartlett's test of sphericity to check whether Principal Component Analysis can be performed on the predictor variables of the dataset.
cortest.bartlett(cor.h, nrow(hair.corr))
## $chisq
## [1] 619.3976
##
## $p.value
## [1] 1.693724e-96
##
## $df
## [1] 55
Since the p-value of the test is far below a significance level of alpha = 0.001, we reject the null hypothesis H0 that there is no correlation among the predictor variables (under which PCA could not meaningfully be conducted); PCA can therefore proceed.
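Before fixing the number of components, the nFactors package loaded above can corroborate the choice of four via eigenvalue-based rules (Kaiser criterion, parallel analysis, scree tests). A sketch of the usual workflow; the four-component choice below is taken as given by the original analysis:
# Eigenvalues of the predictor correlation matrix
ev = eigen(cor.h)
# Eigenvalues expected from random data, for parallel analysis
ap = parallel(subject = nrow(hair.corr), var = ncol(hair.corr),
              rep = 100, cent = 0.05)
# Non-graphical number-of-components solutions, plus an annotated scree plot
nS = nScree(x = ev$values, aparallel = ap$eigen$qevpea)
plotnScree(nS)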
Using varimax rotation, we conduct the PCA with 4 components. The dataset hair.corr contains all 11 predictor variables (the ID column and the dependent variable Satisfaction have been removed).
hair.pca = principal(hair.corr, nfactors = 4, rotate = "varimax")
hair.pca
## Principal Components Analysis
## Call: principal(r = hair.corr, nfactors = 4, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC2 RC3 RC4 h2 u2 com
## ProdQual 0.00 -0.01 -0.03 0.88 0.77 0.232 1.0
## Ecom 0.06 0.87 0.05 -0.12 0.78 0.223 1.1
## TechSup 0.02 -0.02 0.94 0.10 0.89 0.107 1.0
## CompRes 0.93 0.12 0.05 0.09 0.88 0.119 1.1
## Advertising 0.14 0.74 -0.08 0.01 0.58 0.424 1.1
## ProdLine 0.59 -0.06 0.15 0.64 0.79 0.213 2.1
## SalesFImage 0.13 0.90 0.08 -0.16 0.86 0.141 1.1
## ComPricing -0.09 0.23 -0.25 -0.72 0.64 0.359 1.5
## WartyClaim 0.11 0.05 0.93 0.10 0.89 0.108 1.1
## OrdBilling 0.86 0.11 0.08 0.04 0.77 0.234 1.1
## DelSpeed 0.94 0.18 0.00 0.05 0.91 0.086 1.1
##
## RC1 RC2 RC3 RC4
## SS loadings 2.89 2.23 1.86 1.77
## Proportion Var 0.26 0.20 0.17 0.16
## Cumulative Var 0.26 0.47 0.63 0.80
## Proportion Explained 0.33 0.26 0.21 0.20
## Cumulative Proportion 0.33 0.59 0.80 1.00
##
## Mean item complexity = 1.2
## Test of the hypothesis that 4 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.06
## with the empirical chi square 39.02 with prob < 0.0018
##
## Fit based upon off diagonal values = 0.97
The 4 RCs explain about 80% of the cumulative variance in the dataset, which is a good number.
After studying the PCA results on the hair dataset, a loading cutoff of 0.6 was chosen to check whether the variability of each predictor can be attributed to a single component. It worked: every input variable loads on exactly one of the components (RCs).
print(hair.pca$loadings, cutoff = 0.6)
##
## Loadings:
## RC1 RC2 RC3 RC4
## ProdQual 0.876
## Ecom 0.871
## TechSup 0.939
## CompRes 0.926
## Advertising 0.742
## ProdLine 0.642
## SalesFImage 0.900
## ComPricing -0.723
## WartyClaim 0.931
## OrdBilling 0.864
## DelSpeed 0.938
##
## RC1 RC2 RC3 RC4
## SS loadings 2.893 2.234 1.856 1.774
## Proportion Var 0.263 0.203 0.169 0.161
## Cumulative Var 0.263 0.466 0.635 0.796
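The same variable-to-component mapping can be drawn with psych's diagram helper, which attaches each variable to the component on which it loads most strongly. A quick visual sketch using the fitted object above:
# Path-style diagram of the rotated solution; with simple = TRUE only the
# largest loading per variable is shown
fa.diagram(hair.pca, simple = TRUE)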
Scores for the individual IDs (observation rows) were extracted from the PCA results and rounded to two decimal places for ease of computation.
scores = round(hair.pca$scores, 2)
Table of meaningful names for the principal components:
| Components | Meaningful Names | Column Name |
|---|---|---|
| RC1 | Purchasing Experience | Pchexp |
| RC2 | Brand Recognition | Bdrecog |
| RC3 | After Sales Service | Aftsvc |
| RC4 | Product | Prodt |
Explanation
The score matrix was converted into a data frame, and its variables, which are the PCA components, were given meaningful names for further analysis. Through PCA we achieved a dimensionality reduction in which just 4 factors stand in for all 11 predictor variables of the hair dataset.
as.data.frame(scores)
## RC1 RC2 RC3 RC4
## 1 0.13 0.77 -1.88 0.37
## 2 1.22 -1.65 -0.61 0.81
## 3 0.62 0.58 0.00 1.57
## 4 -0.84 -0.27 1.27 -1.25
## 5 -0.32 -0.83 -0.01 0.45
## 6 -0.65 -1.07 -1.30 -1.05
## 7 -2.63 -0.25 -0.56 -1.23
## 8 -0.28 -0.16 -0.75 -1.01
## 9 1.05 -0.17 -0.09 -1.66
## 10 0.43 0.76 -0.45 -0.89
## 11 -0.14 -0.77 -0.46 0.61
## 12 -1.45 1.36 0.44 -1.07
## 13 0.62 2.11 -0.17 0.87
## 14 0.43 -0.40 0.43 0.90
## 15 1.44 0.66 -0.27 -1.04
## 16 0.92 -1.06 -0.56 1.17
## 17 0.52 -0.32 1.11 -1.03
## 18 1.71 -0.16 0.25 -1.48
## 19 1.16 -0.42 -0.38 -1.76
## 20 0.29 1.78 -0.95 0.24
## 21 -0.62 -0.18 1.53 -1.83
## 22 -0.11 2.83 0.63 2.24
## 23 0.08 -0.35 1.14 1.33
## 24 1.95 -1.67 -0.86 0.50
## 25 0.12 -0.02 0.47 -1.25
## 26 0.57 -0.24 0.62 -1.35
## 27 0.83 -0.99 1.04 0.92
## 28 0.12 -1.11 0.38 -1.36
## 29 1.16 -1.61 -0.06 0.80
## 30 -0.51 0.16 -1.55 -0.31
## 31 -0.81 -0.18 2.26 0.22
## 32 -1.07 -1.60 1.19 -0.07
## 33 -0.50 0.31 0.16 -0.97
## 34 0.28 0.07 -0.03 -0.66
## 35 -1.21 0.61 0.28 -0.69
## 36 -1.38 -1.06 0.28 1.03
## 37 -0.62 -0.24 0.31 0.66
## 38 1.36 0.04 0.11 0.58
## 39 0.60 0.47 -1.29 -0.45
## 40 -0.59 1.48 -1.18 -1.04
## 41 0.19 -0.39 -1.98 -0.60
## 42 0.04 0.09 -1.17 0.54
## 43 0.41 1.96 -1.09 0.99
## 44 0.78 1.61 1.51 -1.15
## 45 1.27 -1.77 -0.98 0.74
## 46 1.06 0.68 0.32 -1.10
## 47 -0.12 -0.09 1.00 1.42
## 48 2.10 0.46 0.84 -1.68
## 49 0.16 0.88 -0.84 1.30
## 50 0.23 0.50 -0.88 1.04
## 51 -0.94 -0.38 0.19 -0.65
## 52 1.56 -1.91 -1.18 0.72
## 53 0.86 -1.09 -0.24 0.87
## 54 -0.82 -0.53 0.54 0.33
## 55 0.54 -0.68 -1.06 -0.81
## 56 -0.37 0.28 0.92 0.60
## 57 1.98 1.43 -0.09 -0.84
## 58 0.21 0.52 0.35 0.86
## 59 -1.34 0.55 0.33 1.94
## 60 0.85 -1.58 0.57 0.74
## 61 0.99 -1.26 1.70 0.79
## 62 -1.10 0.71 -0.15 0.40
## 63 -0.76 0.26 -1.19 0.78
## 64 -1.09 -1.95 0.43 -0.15
## 65 -1.21 0.15 0.58 -0.52
## 66 1.34 0.54 -1.04 -1.25
## 67 0.90 -0.59 2.06 -1.32
## 68 0.42 -0.25 -0.30 -0.85
## 69 -0.87 -0.60 -1.00 -0.53
## 70 0.14 -0.15 -1.28 -1.00
## 71 0.34 2.06 0.69 0.09
## 72 -1.16 -0.18 -1.21 0.71
## 73 0.93 1.32 -1.87 -0.56
## 74 -0.57 1.40 1.23 1.35
## 75 -0.30 0.87 -0.29 0.30
## 76 -0.89 0.23 1.04 1.61
## 77 -0.36 0.14 2.06 -0.63
## 78 0.21 0.34 1.07 0.31
## 79 1.13 0.64 0.44 1.47
## 80 -1.53 0.29 0.03 -0.31
## 81 -0.85 -0.25 0.45 1.53
## 82 0.03 -0.92 0.49 0.40
## 83 -1.39 -0.98 0.21 0.63
## 84 -2.49 -0.74 1.63 -1.44
## 85 1.00 -1.78 0.80 -0.01
## 86 -0.83 -0.42 -1.08 -0.45
## 87 -1.43 -0.30 -2.16 -1.27
## 88 1.07 -1.30 1.40 0.04
## 89 0.09 -0.06 0.13 0.24
## 90 1.08 2.38 1.89 -1.01
## 91 -0.78 0.46 1.39 0.61
## 92 -2.35 -0.26 -0.53 -1.19
## 93 0.30 0.21 -0.37 1.21
## 94 1.11 0.37 0.05 1.45
## 95 -0.80 0.71 -1.09 1.06
## 96 -0.11 0.40 0.05 0.35
## 97 -0.21 -0.25 -1.88 -0.32
## 98 -1.59 -1.12 -1.34 1.24
## 99 -0.33 1.90 0.14 -0.12
## 100 -0.63 0.21 -0.75 -0.70
colnames(scores) = c("Pchexp", "Bdrecog", "Aftsvc", "Prodt")
Head of the scores matrix:
print(head(scores))
## Pchexp Bdrecog Aftsvc Prodt
## [1,] 0.13 0.77 -1.88 0.37
## [2,] 1.22 -1.65 -0.61 0.81
## [3,] 0.62 0.58 0.00 1.57
## [4,] -0.84 -0.27 1.27 -1.25
## [5,] -0.32 -0.83 -0.01 0.45
## [6,] -0.65 -1.07 -1.30 -1.05
The score data frame was combined with a smaller extracted subset (hair_s) containing the ID and Satisfaction columns, forming a meaningful dataset hair_new that is free of multicollinearity and has a manageable number of predictor variables (just 4) for further regression model building.
Head of hair_new:
hair_s = hair %>% select(c("ID", "Satisfaction"))
hair_new = cbind(hair_s, scores)
print(head(hair_new))
## ID Satisfaction Pchexp Bdrecog Aftsvc Prodt
## 1 1 8.2 0.13 0.77 -1.88 0.37
## 2 2 5.7 1.22 -1.65 -0.61 0.81
## 3 3 8.9 0.62 0.58 0.00 1.57
## 4 4 4.8 -0.84 -0.27 1.27 -1.25
## 5 5 7.1 -0.32 -0.83 -0.01 0.45
## 6 6 4.7 -0.65 -1.07 -1.30 -1.05
m.linear.RegModel = lm(Satisfaction ~ Pchexp + Bdrecog + Aftsvc + Prodt, hair_new)
summary(m.linear.RegModel)
##
## Call:
## lm(formula = Satisfaction ~ Pchexp + Bdrecog + Aftsvc + Prodt,
## data = hair_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6346 -0.5021 0.1368 0.4617 1.5235
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.91813 0.07087 97.617 < 2e-16 ***
## Pchexp 0.61799 0.07122 8.677 1.11e-13 ***
## Bdrecog 0.50994 0.07123 7.159 1.71e-10 ***
## Aftsvc 0.06686 0.07120 0.939 0.35
## Prodt 0.54014 0.07124 7.582 2.27e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7087 on 95 degrees of freedom
## Multiple R-squared: 0.6607, Adjusted R-squared: 0.6464
## F-statistic: 46.25 on 4 and 95 DF, p-value: < 2.2e-16
Summary Explained
Looking at the Pr(>|t|) values of the coefficients, the intercept (beta-naught) is significant even at the 0.001 level, so it is definitely not zero and contributes to the regression model.
Similarly, the predictors Purchase Experience, Brand Recognition, and Product have significant betas, implying that the response variable Satisfaction is linearly associated with them.
After Sales Service is the only variable with a high p-value, implying that its beta coefficient may not contribute significantly to the model, or may be zero.
Altogether, the adjusted R-squared shows that these predictors explain 64.6% of the variability in the response, which is still good enough (if perhaps not excellent).
The overall model p-value (< 2.2e-16) given by the F-statistic provides strong evidence against the null hypothesis; the model is significantly valid at this point.
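Since varimax is an orthogonal rotation, the component scores are essentially uncorrelated, so the multicollinearity that motivated the PCA should now be gone. This can be confirmed with the same VIF check used earlier (a sketch; all VIFs should come out close to 1):
# VIFs for the component-based model; values near 1 confirm that the
# orthogonal component scores carry no multicollinearity
vif(m.linear.RegModel)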
Using the newly built multiple regression model, new Satisfaction scores were predicted (pred.Satisfn) to check the validity of the model. The data frame hair_new was then extended to hold the columns 1. ID, 2. Satisfaction rating, 3. Purchase Experience, 4. Brand Recognition, 5. After Sales Service, 6. Product, and 7. predicted Satisfaction (from the multiple linear model).
pred.Satisfn = predict(m.linear.RegModel)
as.data.frame(pred.Satisfn)
## pred.Satisfn
## 1 7.465282
## 2 7.227410
## 3 8.445062
## 4 5.671074
## 5 6.539523
## 6 5.316752
## 7 4.463536
## 8 6.067824
## 9 6.577679
## 10 7.060607
## 11 6.737694
## 12 6.167033
## 13 8.835800
## 14 7.494763
## 15 7.564792
## 16 7.540669
## 17 6.594171
## 18 7.110605
## 19 6.444772
## 20 8.071152
## 21 5.557027
## 22 9.545299
## 23 7.583692
## 24 7.484183
## 25 6.338339
## 26 6.460261
## 27 7.492680
## 28 5.717078
## 29 7.242097
## 30 6.413479
## 31 6.595699
## 32 5.482739
## 33 6.253981
## 34 6.768365
## 35 6.127453
## 36 6.099842
## 37 6.789812
## 38 8.099623
## 39 7.199285
## 40 6.667590
## 41 6.380216
## 42 7.202198
## 43 8.632843
## 44 7.700948
## 45 7.134571
## 46 7.347193
## 47 7.631931
## 48 7.599196
## 49 8.111773
## 50 7.818146
## 51 5.805062
## 52 7.218221
## 53 7.347644
## 54 6.355465
## 55 6.396708
## 56 7.217849
## 57 8.411216
## 58 7.800993
## 59 7.440426
## 60 7.075531
## 61 7.427782
## 62 6.806428
## 63 6.922795
## 64 5.197880
## 65 6.004763
## 66 7.276893
## 67 6.598196
## 68 6.571026
## 69 5.721393
## 70 6.302445
## 71 8.273456
## 72 6.412083
## 73 7.738474
## 74 8.091208
## 75 7.319033
## 76 7.424562
## 77 6.564483
## 78 7.460264
## 79 8.766233
## 80 5.955058
## 81 7.121857
## 82 6.716345
## 83 5.913722
## 84 4.333171
## 85 6.676514
## 86 5.875764
## 87 5.051047
## 88 7.031663
## 89 7.081478
## 90 8.380018
## 91 7.093087
## 92 4.655084
## 93 7.839444
## 94 8.579314
## 95 7.285471
## 96 7.246518
## 97 6.362337
## 98 5.944592
## 99 7.627616
## 100 6.207648
hair_new = cbind(hair_new, pred.Satisfn)
hair_new$pred.Satisfn = round(hair_new$pred.Satisfn, 1)
Head of hair_new after combining the predicted values:
print(head(hair_new))
## ID Satisfaction Pchexp Bdrecog Aftsvc Prodt pred.Satisfn
## 1 1 8.2 0.13 0.77 -1.88 0.37 7.5
## 2 2 5.7 1.22 -1.65 -0.61 0.81 7.2
## 3 3 8.9 0.62 0.58 0.00 1.57 8.4
## 4 4 4.8 -0.84 -0.27 1.27 -1.25 5.7
## 5 5 7.1 -0.32 -0.83 -0.01 0.45 6.5
## 6 6 4.7 -0.65 -1.07 -1.30 -1.05 5.3
Plot analysis reveals that our new multiple linear regression model is quite good, with predictions close to the actual Satisfaction scores. Blue dots represent the actual Satisfaction ratings; red dots represent the predicted Satisfaction scores derived from the multiple linear regression model.
plot(hair_new$Satisfaction, col="blue", xlab = "Data points or IDs",
ylab = "Satisfacton score",
type = "b", cex = 1, pch = 21, bg = "blue")
lines(hair_new$pred.Satisfn, col= "Red", type = "b")
text(28, 9.9, "Actual value", col = "blue")
text(14.5, 9, "Predicted value", col ="red")
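The same comparison can be drawn with ggplot2, which is loaded above but otherwise unused and which handles the legend automatically. An equivalent sketch of the base plot above:
# Actual vs predicted Satisfaction by observation ID
ggplot(hair_new, aes(x = ID)) +
  geom_line(aes(y = Satisfaction, colour = "Actual")) +
  geom_point(aes(y = Satisfaction, colour = "Actual")) +
  geom_line(aes(y = pred.Satisfn, colour = "Predicted")) +
  geom_point(aes(y = pred.Satisfn, colour = "Predicted")) +
  scale_colour_manual(values = c(Actual = "blue", Predicted = "red")) +
  labs(x = "Data points or IDs", y = "Satisfaction score", colour = NULL)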
In a nutshell, we can say that the Satisfaction rating for the hair product depends most heavily on the customer's overall Purchasing Experience, i.e., how quickly the product is delivered and billed and whether any complaints are resolved in the shortest possible time.
Brand Recognition, driven by the product's advertising, comes second in the customer's mind. The Product itself comes third in satisfying the customer (though statistically its variance-explaining capacity ranked fourth), and After Sales Service contributes fourth.
There can be differences between the real operating world and these statistical models, but this model comes closest to explaining the data provided for deduction.