Simple Linear Regression

Employee Churn Dataset

Assignment 10

# Read the employee churn data set into a data frame.
# NOTE: the path below is specific to the original author's machine and has
# to be adjusted before running the script anywhere else.
mydata <- read.csv(
  file = "C:\\Users\\RISHI RAHUL\\Desktop\\DS\\2 SLR\\Assignment\\emp_data.csv"
)

# Column names of the loaded data frame.
names(mydata)
## [1] "Salary_hike"    "Churn_out_rate"
# Put the columns on the search path: the statements further down refer to
# Salary_hike and Churn_out_rate by their bare names.
attach(mydata)

# Descriptive statistics (first to fourth moment business decisions):
# location via summary(), spread via var()/sd(), then skewness and kurtosis.
summary(object = mydata)
##   Salary_hike   Churn_out_rate 
##  Min.   :1580   Min.   :60.00  
##  1st Qu.:1618   1st Qu.:65.75  
##  Median :1675   Median :71.00  
##  Mean   :1689   Mean   :72.90  
##  3rd Qu.:1724   3rd Qu.:78.75  
##  Max.   :1870   Max.   :92.00
# Spread: variance of the salary hike, standard deviation of the churn rate.
var(mydata$Salary_hike)
## [1] 8481.822
sd(mydata$Churn_out_rate)
## [1] 10.25725
# skewness() and kurtosis() are taken from the e1071 package.
library(e1071)
skewness(mydata$Salary_hike)
## [1] 0.6180303
kurtosis(mydata$Churn_out_rate)
## [1] -1.162193
# Exploratory plots
# Box plots of the salary hike: vertical first, then horizontal.
boxplot(Salary_hike)

boxplot(Salary_hike, horizontal = TRUE)

# Histogram of the churn-out rate.
hist(Churn_out_rate)

# Normality check: normal Q-Q plot of the salary hike. The reference line
# added by qqline() makes departures from normality visible; without it the
# point pattern alone is hard to judge.
qqnorm(Salary_hike)
qqline(Salary_hike)

# Linearity check: scatter plot of the two columns against each other.
plot(mydata)

# Correlation matrix of the two columns of the employee data set
cor(x = mydata)
##                Salary_hike Churn_out_rate
## Salary_hike      1.0000000     -0.9117216
## Churn_out_rate  -0.9117216      1.0000000
# Baseline simple linear regression: churn-out rate explained by salary hike
model <- lm(formula = Churn_out_rate ~ Salary_hike)
# Multiple R-squared of this fit: 0.8312
summary(model)
## 
## Call:
## lm(formula = Churn_out_rate ~ Salary_hike)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.804 -3.059 -1.819  2.430  8.072 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 244.36491   27.35194   8.934 1.96e-05 ***
## Salary_hike  -0.10154    0.01618  -6.277 0.000239 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.469 on 8 degrees of freedom
## Multiple R-squared:  0.8312, Adjusted R-squared:  0.8101 
## F-statistic:  39.4 on 1 and 8 DF,  p-value: 0.0002386
# Logarithmic transformation: the predictor enters the model as
# log(Salary_hike), the response stays on its original scale.
model_log <- lm(formula = Churn_out_rate ~ log(Salary_hike))
# Multiple R-squared of this fit: 0.8486
summary(model_log)
## 
## Call:
## lm(formula = Churn_out_rate ~ log(Salary_hike))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.678 -2.851 -1.794  2.275  7.624 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1381.5      195.4   7.070 0.000105 ***
## log(Salary_hike)   -176.1       26.3  -6.697 0.000153 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.233 on 8 degrees of freedom
## Multiple R-squared:  0.8486, Adjusted R-squared:  0.8297 
## F-statistic: 44.85 on 1 and 8 DF,  p-value: 0.0001532
# Exponential model: the response enters as log(Churn_out_rate), the
# predictor stays on its original scale.
model_exp <- lm(formula = log(Churn_out_rate) ~ Salary_hike)
# Multiple R-squared of this fit: 0.8735
summary(model_exp)
## 
## Call:
## lm(formula = log(Churn_out_rate) ~ Salary_hike)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.04825 -0.03519 -0.01909  0.02942  0.08970 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.6383000  0.3175983  20.902 2.88e-08 ***
## Salary_hike -0.0013963  0.0001878  -7.434 7.38e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0519 on 8 degrees of freedom
## Multiple R-squared:  0.8735, Adjusted R-squared:  0.8577 
## F-statistic: 55.26 on 1 and 8 DF,  p-value: 7.377e-05
# Log-log model: both the response and the predictor are log-transformed.
model_log2 <- lm(formula = log(Churn_out_rate) ~ log(Salary_hike))
# Multiple R-squared of this fit: 0.8891
summary(model_log2)
## 
## Call:
## lm(formula = log(Churn_out_rate) ~ log(Salary_hike))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.04433 -0.03234 -0.01865  0.02737  0.08377 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       22.2472     2.2436   9.916 9.04e-06 ***
## log(Salary_hike)  -2.4180     0.3019  -8.008 4.33e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0486 on 8 degrees of freedom
## Multiple R-squared:  0.8891, Adjusted R-squared:  0.8752 
## F-statistic: 64.13 on 1 and 8 DF,  p-value: 4.335e-05
# 95% confidence intervals for the coefficients of the log-log model.
confint(model_log2, level = 0.95)
##                      2.5 %    97.5 %
## (Intercept)      17.073481 27.420881
## log(Salary_hike) -3.114298 -1.721744
# Prediction intervals for the training rows. The interval type is spelled
# out in full ("prediction") instead of relying on partial matching of the
# option value. fit/lwr/upr are on the log scale of the response, because
# model_log2 models log(Churn_out_rate); apply exp() to read them as rates.
predict(model_log2, interval = "prediction")
## Warning in predict.lm(model_log2, interval = "prediction"): predictions on current data refer to _future_ responses
##         fit      lwr      upr
## 1  4.438020 4.312022 4.564019
## 2  4.407605 4.284489 4.530720
## 3  4.392539 4.270643 4.514435
## 4  4.347897 4.228759 4.467036
## 5  4.318588 4.200531 4.436645
## 6  4.275279 4.157724 4.392833
## 7  4.252494 4.134673 4.370314
## 8  4.218714 4.099832 4.337596
## 9  4.122803 3.996797 4.248809
## 10 4.030551 3.892729 4.168373
# model_log2 has the highest R-squared of the four candidate models, so it is
# used for the final predictions. Its fitted values are on the log scale,
# because the response was modelled as log(Churn_out_rate).
model_final <- predict(model_log2)
model_final
##        1        2        3        4        5        6        7        8 
## 4.438020 4.407605 4.392539 4.347897 4.318588 4.275279 4.252494 4.218714 
##        9       10 
## 4.122803 4.030551
# Root mean squared error on the original churn-rate scale. The fitted values
# are back-transformed with exp() first: subtracting the raw rates (60-92)
# from log-scale fits (about 4.0-4.4) mixes two scales and yields a
# meaningless error.
rmse <- sqrt(mean((exp(model_final) - Churn_out_rate)^2))
rmse
## (re-run to refresh this value; the 69.29027 recorded earlier was computed
## from log-scale fits compared against raw rates)
# Diagnostic plots produced by plot() for the fitted log-log model.
plot(model_log2)

# Histogram of the residuals of the log-log model. The residuals are on the
# log scale, since the model's response is log(Churn_out_rate).
hist(residuals(model_log2)) 

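Conclusion: the model with a log transformation on both sides gives the best fit, with an R-squared of 0.8891, i.e. it explains about 88.91% of the variance in log(Churn_out_rate).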