# Assignment 10 - Simple linear regression: Churn_out_rate vs Salary_hike
# Load the employee data set from CSV.
# NOTE: the path below is machine-specific; adjust it when running elsewhere.
mydata <- read.csv(file = "C:\\Users\\RISHI RAHUL\\Desktop\\DS\\2 SLR\\Assignment\\emp_data.csv")
# List the column names of the data frame.
names(mydata)
## [1] "Salary_hike" "Churn_out_rate"
# Put the columns on the search path so that statements can refer to them by
# bare name. attach() is generally discouraged (objects of the same name in
# the global environment mask the attached columns); prefer `mydata$col` or
# `data = mydata` in new code.
attach(mydata)
# First- to fourth-moment business decisions:
# centre (summary), spread (var / sd), skewness and kurtosis.
summary(mydata)
## Salary_hike Churn_out_rate
## Min. :1580 Min. :60.00
## 1st Qu.:1618 1st Qu.:65.75
## Median :1675 Median :71.00
## Mean :1689 Mean :72.90
## 3rd Qu.:1724 3rd Qu.:78.75
## Max. :1870 Max. :92.00
# Spread: variance of the predictor, standard deviation of the response.
var(mydata$Salary_hike)
## [1] 8481.822
sd(mydata$Churn_out_rate)
## [1] 10.25725
# e1071 supplies skewness() and kurtosis().
library(e1071)
skewness(mydata$Salary_hike)
## [1] 0.6180303
kurtosis(mydata$Churn_out_rate)
## [1] -1.162193
# Exploratory plots: box plots (vertical and horizontal) of Salary_hike
# and a histogram of Churn_out_rate.
with(mydata, boxplot(Salary_hike))

with(mydata, boxplot(Salary_hike, horizontal = TRUE))

with(mydata, hist(Churn_out_rate))

# Normal Q-Q plot of Salary_hike (normality check), followed by a plot of
# the whole data frame to eyeball the relationship between the two columns.
with(mydata, qqnorm(Salary_hike))

plot(mydata)

# Correlation matrix of the two columns (cor() default method).
cor(mydata)
## Salary_hike Churn_out_rate
## Salary_hike 1.0000000 -0.9117216
## Churn_out_rate -0.9117216 1.0000000
# Simple linear regression: Churn_out_rate ~ Salary_hike.
# The columns are taken from `mydata` explicitly (data = mydata) so the fit
# does not depend on attach() and cannot silently pick up same-named objects
# from the global environment.
model <- lm(Churn_out_rate ~ Salary_hike, data = mydata)
summary(model) # R-squared: 0.8312
##
## Call:
## lm(formula = Churn_out_rate ~ Salary_hike, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.804 -3.059 -1.819 2.430 8.072
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 244.36491 27.35194 8.934 1.96e-05 ***
## Salary_hike -0.10154 0.01618 -6.277 0.000239 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.469 on 8 degrees of freedom
## Multiple R-squared: 0.8312, Adjusted R-squared: 0.8101
## F-statistic: 39.4 on 1 and 8 DF, p-value: 0.0002386
# Logarithmic transformation of the predictor:
# Churn_out_rate ~ log(Salary_hike).
# data = mydata makes the fit independent of attach() and of same-named
# objects in the global environment.
model_log <- lm(Churn_out_rate ~ log(Salary_hike), data = mydata)
summary(model_log) # R-squared: 0.8486
##
## Call:
## lm(formula = Churn_out_rate ~ log(Salary_hike), data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.678 -2.851 -1.794 2.275 7.624
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1381.5 195.4 7.070 0.000105 ***
## log(Salary_hike) -176.1 26.3 -6.697 0.000153 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.233 on 8 degrees of freedom
## Multiple R-squared: 0.8486, Adjusted R-squared: 0.8297
## F-statistic: 44.85 on 1 and 8 DF, p-value: 0.0001532
# Exponential model: log(Churn_out_rate) ~ Salary_hike.
# The response is modelled on the log scale, so residuals and the residual
# standard error below are in log units.
# data = mydata makes the fit independent of attach() and of same-named
# objects in the global environment.
model_exp <- lm(log(Churn_out_rate) ~ Salary_hike, data = mydata)
summary(model_exp) # R-squared: 0.8735
##
## Call:
## lm(formula = log(Churn_out_rate) ~ Salary_hike, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.04825 -0.03519 -0.01909 0.02942 0.08970
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.6383000 0.3175983 20.902 2.88e-08 ***
## Salary_hike -0.0013963 0.0001878 -7.434 7.38e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0519 on 8 degrees of freedom
## Multiple R-squared: 0.8735, Adjusted R-squared: 0.8577
## F-statistic: 55.26 on 1 and 8 DF, p-value: 7.377e-05
# Log-log model: log(Churn_out_rate) ~ log(Salary_hike).
# data = mydata makes the fit independent of attach() and of same-named
# objects in the global environment.
model_log2 <- lm(log(Churn_out_rate) ~ log(Salary_hike), data = mydata)
summary(model_log2) # R-squared: 0.8891
##
## Call:
## lm(formula = log(Churn_out_rate) ~ log(Salary_hike), data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.04433 -0.03234 -0.01865 0.02737 0.08377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.2472 2.2436 9.916 9.04e-06 ***
## log(Salary_hike) -2.4180 0.3019 -8.008 4.33e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0486 on 8 degrees of freedom
## Multiple R-squared: 0.8891, Adjusted R-squared: 0.8752
## F-statistic: 64.13 on 1 and 8 DF, p-value: 4.335e-05
# 95% confidence intervals for the coefficients.
confint(model_log2, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 17.073481 27.420881
## log(Salary_hike) -3.114298 -1.721744
# Prediction intervals for the training rows. The interval type is spelled
# out in full rather than relying on partial matching of "predict".
# fit / lwr / upr are on the log scale of the response; apply exp() to read
# them as churn-out rates.
predict(model_log2, interval = "prediction")
## Warning in predict.lm(model_log2, interval = "prediction"): predictions on current data refer to _future_ responses
## fit lwr upr
## 1 4.438020 4.312022 4.564019
## 2 4.407605 4.284489 4.530720
## 3 4.392539 4.270643 4.514435
## 4 4.347897 4.228759 4.467036
## 5 4.318588 4.200531 4.436645
## 6 4.275279 4.157724 4.392833
## 7 4.252494 4.134673 4.370314
## 8 4.218714 4.099832 4.337596
## 9 4.122803 3.996797 4.248809
## 10 4.030551 3.892729 4.168373
# model_log2 has the highest R-squared of the four fits (0.8891).
# NOTE(review): R-squared for the log(Churn_out_rate) models is measured on
# the log scale, so it is not strictly comparable with the raw-response
# models; compare on the original scale (e.g. via the RMSE below) to confirm.
# Fitted values of model_log2 -- these are log(Churn_out_rate), not rates.
model_final <- predict(model_log2)
model_final
## 1 2 3 4 5 6 7 8
## 4.438020 4.407605 4.392539 4.347897 4.318588 4.275279 4.252494 4.218714
## 9 10
## 4.122803 4.030551
# Back-transform the fitted values to the original scale before comparing
# them with the observed churn-out rate. Subtracting log-scale fits (~4.3)
# directly from raw rates (~73) is what produced the meaningless RMSE of
# 69.29027 recorded previously.
churn_fitted <- exp(model_final)
rmse <- sqrt(mean((churn_fitted - mydata$Churn_out_rate)^2))
rmse
# (re-run to record the corrected RMSE; the old value 69.29027 mixed the log
# and raw scales and must not be reported)
# Diagnostic plots produced by plot() for the fitted lm object model_log2.
plot(model_log2)

# Histogram of the model residuals (log scale, since the response is
# log(Churn_out_rate)). Title and axis label are given explicitly and match
# the defaults hist() would derive from the expression.
hist(
  residuals(model_log2),
  main = "Histogram of residuals(model_log2)",
  xlab = "residuals(model_log2)"
)
