setwd("/Users/anand/RProjects/Linear Regression")
getwd()
## [1] "/Users/anand/RProjects/Linear Regression"
per_index <- read.csv("Performance Index.csv")
head(per_index)
## empid jpi aptitude tol technical general
## 1 1 45.52 43.83 55.92 51.82 43.58
## 2 2 40.10 32.71 32.56 51.49 51.03
## 3 3 50.61 56.64 54.84 52.29 52.47
## 4 4 38.97 51.53 59.69 47.48 47.69
## 5 5 41.87 51.35 51.50 47.59 45.77
## 6 6 38.71 39.60 43.63 48.34 42.06
# Remove the empid column as it is an identifier, not an independent variable.
per_index$empid <- NULL
str(per_index)
## 'data.frame': 33 obs. of 5 variables:
## $ jpi : num 45.5 40.1 50.6 39 41.9 ...
## $ aptitude : num 43.8 32.7 56.6 51.5 51.4 ...
## $ tol : num 55.9 32.6 54.8 59.7 51.5 ...
## $ technical: num 51.8 51.5 52.3 47.5 47.6 ...
## $ general : num 43.6 51 52.5 47.7 45.8 ...
summary(per_index)
## jpi aptitude tol technical
## Min. :31.64 Min. :32.71 Min. :32.56 Min. :41.25
## 1st Qu.:41.19 1st Qu.:45.59 1st Qu.:44.89 1st Qu.:48.34
## Median :49.45 Median :53.38 Median :57.04 Median :51.64
## Mean :47.87 Mean :52.66 Mean :53.99 Mean :52.02
## 3rd Qu.:53.92 3rd Qu.:56.75 3rd Qu.:61.28 3rd Qu.:54.68
## Max. :66.39 Max. :75.03 Max. :68.53 Max. :67.27
## general
## Min. :37.00
## 1st Qu.:45.07
## Median :50.53
## Mean :49.04
## 3rd Qu.:53.50
## Max. :58.90
Testing the assumptions of linear regression and some EDA on the data
shapiro.test(per_index$jpi)
##
## Shapiro-Wilk normality test
##
## data: per_index$jpi
## W = 0.97326, p-value = 0.575
Since the p-value (0.575) is greater than 0.05, we fail to reject H0; the dependent variable jpi can be assumed to be normally distributed.
Let's check the distribution and skewness of the dependent variable visually.
boxplot(per_index$jpi)
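As an additional visual check (a sketch, not part of the original output), a normal Q-Q plot shows whether the jpi values track the theoretical normal quantiles:
qqnorm(per_index$jpi)           # points near the reference line suggest normality
qqline(per_index$jpi, col = "red")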
Let's run a pair plot to check the correlations among the variables.
pairs(~jpi+aptitude+tol+technical+general, data = per_index, col="red")
# alternatively, use pairs(~ ., data = per_index, col = "red") to select all columns
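As a numeric complement to the pair plot (a sketch, assuming it is run before the predicted and residual columns are added later), the correlation matrix quantifies what the scatterplots show:
# Pairwise Pearson correlations among all five columns
round(cor(per_index), 2)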
PERFORM MULTIPLE LINEAR REGRESSION
per_index_model <- lm(jpi~., data = per_index);per_index_model
##
## Call:
## lm(formula = jpi ~ ., data = per_index)
##
## Coefficients:
## (Intercept) aptitude tol technical general
## -54.28225 0.32356 0.03337 1.09547 0.53683
SUMMARIZE MODEL FOR GLOBAL HYPOTHESIS TESTING
summary(per_index_model)
##
## Call:
## lm(formula = jpi ~ ., data = per_index)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.2891 -2.7692 0.4562 2.8508 5.6068
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -54.28225 7.39453 -7.341 5.41e-08 ***
## aptitude 0.32356 0.06778 4.774 5.15e-05 ***
## tol 0.03337 0.07124 0.468 0.6431
## technical 1.09547 0.18138 6.039 1.65e-06 ***
## general 0.53683 0.15840 3.389 0.0021 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.549 on 28 degrees of freedom
## Multiple R-squared: 0.8768, Adjusted R-squared: 0.8592
## F-statistic: 49.81 on 4 and 28 DF, p-value: 2.467e-12
The predictor tol has a p-value (0.6431) far above 0.05, so we remove tol and fit the model again.
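As an aside (a sketch; the explicit formula below is what this walkthrough actually uses), update() can drop a term without retyping the whole formula:
# Equivalent refit dropping tol; model_v2 is a hypothetical name
model_v2 <- update(per_index_model, . ~ . - tol)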
per_index_model <- lm(jpi~aptitude+technical+general, data = per_index); per_index_model
##
## Call:
## lm(formula = jpi ~ aptitude + technical + general, data = per_index)
##
## Coefficients:
## (Intercept) aptitude technical general
## -54.4064 0.3333 1.1166 0.5432
summary(per_index_model)
##
## Call:
## lm(formula = jpi ~ aptitude + technical + general, data = per_index)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.4909 -2.7215 0.5793 2.9169 5.5226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -54.40644 7.28964 -7.464 3.17e-08 ***
## aptitude 0.33335 0.06361 5.241 1.30e-05 ***
## technical 1.11663 0.17329 6.444 4.75e-07 ***
## general 0.54316 0.15569 3.489 0.00157 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.501 on 29 degrees of freedom
## Multiple R-squared: 0.8758, Adjusted R-squared: 0.863
## F-statistic: 68.18 on 3 and 29 DF, p-value: 3.026e-13
PREDICT DEPENDENT VARIABLE VALUES USING fitted()
per_index$predicted <- fitted(per_index_model)
head(per_index)
## jpi aptitude tol technical general predicted
## 1 45.52 43.83 55.92 51.82 43.58 41.73850
## 2 40.10 32.71 32.56 51.49 51.03 41.70973
## 3 50.61 56.64 54.84 52.29 52.47 51.36215
## 4 38.97 51.53 59.69 47.48 47.69 41.69149
## 5 41.87 51.35 51.50 47.59 45.77 40.71145
## 6 38.71 39.60 43.63 48.34 42.06 35.61699
PREDICT DEPENDENT VARIABLE VALUES USING predict()
per_index$pred_usingpredicted <- predict(per_index_model, per_index)
head(per_index)
## jpi aptitude tol technical general predicted pred_usingpredicted
## 1 45.52 43.83 55.92 51.82 43.58 41.73850 41.73850
## 2 40.10 32.71 32.56 51.49 51.03 41.70973 41.70973
## 3 50.61 56.64 54.84 52.29 52.47 51.36215 51.36215
## 4 38.97 51.53 59.69 47.48 47.69 41.69149 41.69149
## 5 41.87 51.35 51.50 47.59 45.77 40.71145 40.71145
## 6 38.71 39.60 43.63 48.34 42.06 35.61699 35.61699
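Unlike fitted(), predict() also works on unseen data. As a hypothetical illustration (the scores below are made up), we could score a new employee with a 95% prediction interval:
# Hypothetical new observation; values are illustrative only
new_emp <- data.frame(aptitude = 50, technical = 52, general = 48)
predict(per_index_model, newdata = new_emp, interval = "prediction", level = 0.95)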
LET'S REMOVE THE pred_usingpredicted COLUMN
per_index$pred_usingpredicted <- NULL
CALCULATE RESIDUALS USING residuals()
per_index$residuals <- residuals(per_index_model)
CALCULATE ROOT MEAN SQUARE ERROR (RMSE)
rmse <- sqrt(mean(per_index$residuals^2)); rmse
## [1] 3.282114
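Note that this RMSE divides by n = 33, while the residual standard error in summary() (3.501) divides by the residual degrees of freedom. A quick sketch of the relationship:
# df-adjusted RMSE reproduces the residual standard error from summary()
n <- nrow(per_index)
p <- length(coef(per_index_model))   # 4 coefficients including the intercept
sqrt(sum(per_index$residuals^2) / (n - p))   # ~3.501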
LET US CHECK IF RESIDUALS ARE NORMALLY DISTRIBUTED
shapiro.test(per_index$residuals)
##
## Shapiro-Wilk normality test
##
## data: per_index$residuals
## W = 0.95264, p-value = 0.1588
plot(per_index$residuals)
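Since the p-value (0.1588) is greater than 0.05, we fail to reject H0; the residuals can be assumed normally distributed. Beyond the index plot above, the built-in diagnostics for lm objects are worth a look (a sketch, not part of the original output):
plot(per_index_model, which = 1)   # residuals vs fitted: look for random scatter around zero
plot(per_index_model, which = 2)   # normal Q-Q plot of standardized residuals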
TEST FOR MULTICOLLINEARITY: variance inflation factor (VIF). Is VIF < 5?
library(car)
## Loading required package: carData
# car attaches carData automatically, so a separate library(carData) call is not needed
vif(per_index_model)
## aptitude technical general
## 1.067857 1.945283 2.010262
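All VIFs are below 5, so multicollinearity is not a concern. For intuition, VIF_j = 1/(1 - R_j^2), where R_j^2 comes from regressing predictor j on the other predictors; a sketch verifying the value for aptitude:
# R-squared from regressing aptitude on the remaining predictors
r2_apt <- summary(lm(aptitude ~ technical + general, data = per_index))$r.squared
1 / (1 - r2_apt)   # should match the vif() output for aptitude (~1.07)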
DURBIN-WATSON TEST TO CHECK AUTOCORRELATION OF RESIDUALS. The first-order lag of the residuals should not be autocorrelated. \[d = 2(1-r), \quad 0 \leq d \leq 4\] where r is Pearson's coefficient of correlation between successive residuals:
d = 0 if perfectly positively autocorrelated
d = 2 if uncorrelated
d = 4 if perfectly negatively autocorrelated
H0: there is no autocorrelation in the first-order lag of the residuals
HA: there exists autocorrelation in the first-order lag of the residuals
durbinWatsonTest(per_index_model)
## lag Autocorrelation D-W Statistic p-value
## 1 0.259679 1.391532 0.072
## Alternative hypothesis: rho != 0
Check the p-value: since the p-value of the test (0.072) is greater than alpha (0.05), we fail to reject H0, so there is no significant first-order autocorrelation in the residuals.
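As a complementary check (a sketch, not part of the original analysis), the autocorrelation of the residuals can be inspected directly:
# Lag-1 bar inside the dashed confidence bands suggests no autocorrelation
acf(residuals(per_index_model), main = "ACF of residuals")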