# Simple Linear Regression
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.1
# Read the employee data set from its fixed local CSV location. The summary
# printed further down shows two numeric columns: Salary_hike and
# Churn_out_rate.
Emp_Data <- read.csv(
  file = "E:\\Data Science\\data science\\assignments\\Simple Linear Regression\\emp_data.csv"
)
# Expose the columns on the search path; the statements that follow refer to
# Salary_hike and Churn_out_rate by bare name and depend on this call.
# NOTE(review): attach() is fragile (name masking, stale copies). Prefer
# `data = Emp_Data` / Emp_Data$col once every downstream use has been migrated.
attach(Emp_Data)
# First moment (location): quartiles, median and mean of every column.
summary(Emp_Data)
## Salary_hike Churn_out_rate
## Min. :1580 Min. :60.00
## 1st Qu.:1618 1st Qu.:65.75
## Median :1675 Median :71.00
## Mean :1689 Mean :72.90
## 3rd Qu.:1724 3rd Qu.:78.75
## Max. :1870 Max. :92.00
# Second moment (spread): sample variance and standard deviation per variable.
var(Emp_Data$Salary_hike)
## [1] 8481.822
var(Emp_Data$Churn_out_rate)
## [1] 105.2111
sd(Emp_Data$Salary_hike)
## [1] 92.09681
sd(Emp_Data$Churn_out_rate)
## [1] 10.25725
# Third moment (asymmetry): both values are positive, i.e. right-skewed.
skewness(Emp_Data$Salary_hike)
## [1] 0.6180303
skewness(Emp_Data$Churn_out_rate)
## [1] 0.466011
# Fourth moment (tail weight): e1071::kurtosis() reports excess kurtosis
# (a normal distribution scores 0), so the negative values below indicate
# flatter, lighter-tailed distributions than the normal.
kurtosis(Emp_Data$Salary_hike)
## [1] -0.9358547
kurtosis(Emp_Data$Churn_out_rate)
## [1] -1.162193
# Scatter plot of Salary_hike (y axis) against Churn_out_rate (x axis).
# Evaluating inside with() keeps the axis labels as the bare column names.
with(Emp_Data, plot(Churn_out_rate, Salary_hike, col = "blue"))

# Correlation coefficient (Pearson, the default for cor()): a value close to
# -1 indicates a strong negative linear association between the two variables.
with(Emp_Data, cor(Churn_out_rate, Salary_hike))
## [1] -0.9117216
# Model 1: ordinary least squares on the raw scale, with Salary_hike as the
# response and Churn_out_rate as the single predictor.
# NOTE(review): if the business question is how salary hikes drive churn, the
# formula direction should be reversed - confirm the intended response.
model1 <- lm(Salary_hike ~ Churn_out_rate)
summary(model1)
##
## Call:
## lm(formula = Salary_hike ~ Churn_out_rate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.97 -23.13 -21.41 19.24 75.80
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2285.365 95.912 23.828 1.02e-08 ***
## Churn_out_rate -8.186 1.304 -6.277 0.000239 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40.13 on 8 degrees of freedom
## Multiple R-squared: 0.8312, Adjusted R-squared: 0.8101
## F-statistic: 39.4 on 1 and 8 DF, p-value: 0.0002386
# 95% confidence intervals for both coefficients (0.95 is confint()'s default
# level, so it is not spelled out).
confint(model1)
## 2.5 % 97.5 %
## (Intercept) 2064.19292 2506.537671
## Churn_out_rate -11.19332 -5.178839
# Prediction intervals at the observed Churn_out_rate values. R warns because
# such intervals describe future responses, not the observations used to fit.
predict(model1, interval = "predict")
## Warning in predict.lm(model1, interval = "predict"): predictions on current data refer to _future_ responses
## fit lwr upr
## 1 1532.246 1419.468 1645.023
## 2 1589.548 1485.897 1693.200
## 3 1630.479 1531.103 1729.854
## 4 1671.409 1574.149 1768.669
## 5 1695.967 1598.875 1793.060
## 6 1712.340 1614.894 1809.785
## 7 1728.712 1630.545 1826.879
## 8 1753.270 1653.350 1853.190
## 9 1777.828 1675.388 1880.269
## 10 1794.200 1689.680 1898.721
# Model 1 has an R-squared of 0.8312.
# Its residual standard error is 40.13 on 8 degrees of freedom, expressed in
# Salary_hike units.
# Model 2: log-log transformation. Both the response and the predictor are
# logged, so the slope reads as an elasticity: a 1% rise in Churn_out_rate is
# associated with roughly a 0.37% fall in Salary_hike.
model2 <- lm(log(Salary_hike) ~ log(Churn_out_rate))
summary(model2)
##
## Call:
## lm(formula = log(Salary_hike) ~ log(Churn_out_rate))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.017164 -0.012812 -0.009309 0.008381 0.034919
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.00424 0.19663 45.794 5.71e-11 ***
## log(Churn_out_rate) -0.36769 0.04591 -8.008 4.33e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01895 on 8 degrees of freedom
## Multiple R-squared: 0.8891, Adjusted R-squared: 0.8752
## F-statistic: 64.13 on 1 and 8 DF, p-value: 4.335e-05
# 95% confidence intervals for the log-scale coefficients.
confint(model2, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 8.5508229 9.4576615
## log(Churn_out_rate) -0.4735731 -0.2618155
# Prediction intervals at the observed data. fit/lwr/upr are on the
# log(Salary_hike) scale; apply exp() to read them in Salary_hike units.
predict(model2, interval = "predict")
## Warning in predict.lm(model2, interval = "predict"): predictions on current data refer to _future_ responses
## fit lwr upr
## 1 7.341606 7.289128 7.394085
## 2 7.370705 7.321756 7.419654
## 3 7.392996 7.345914 7.440078
## 4 7.416727 7.370722 7.462731
## 5 7.431737 7.385898 7.477575
## 6 7.442095 7.396133 7.488057
## 7 7.452753 7.406464 7.499042
## 8 7.469344 7.422151 7.516537
## 9 7.486718 7.438092 7.535345
## 10 7.498775 7.448882 7.548668
# R-squared for this model is 0.8891 versus 0.8312 for model1, but it measures
# variance explained in log(Salary_hike), so the two are only loosely
# comparable.
# The residual standard error of 0.01895 (8 degrees of freedom) is in log
# units and must NOT be compared with model1's 40.13, which is in Salary_hike
# units.
# For a like-for-like comparison, compute the root-mean-square error of both
# models on the original Salary_hike scale. exp() of a log-scale fit estimates
# the conditional median rather than the mean, which is adequate for ranking
# the two fits.
rmse_raw_scale <- c(
  model1 = sqrt(mean(residuals(model1)^2)),
  model2 = sqrt(mean((Salary_hike - exp(fitted(model2)))^2))
)
rmse_raw_scale