# Build a prediction model for Churn_out_rate 

sh.cr <- read.csv(file.choose()) # choose the Emp_Data.csv data set
View(sh.cr)

# 10 Observations of 2 variables

# Scatter Diagram (Plot x,y)
plot(sh.cr$Salary_hike,sh.cr$Churn_out_rate)

# Other Exploratory data analysis and Plots

boxplot(sh.cr)

hist(sh.cr$Salary_hike)

hist(sh.cr$Churn_out_rate)

summary(sh.cr)

##   Salary_hike   Churn_out_rate 
##  Min.   :1580   Min.   :60.00  
##  1st Qu.:1618   1st Qu.:65.75  
##  Median :1675   Median :71.00  
##  Mean   :1689   Mean   :72.90  
##  3rd Qu.:1724   3rd Qu.:78.75  
##  Max.   :1870   Max.   :92.00

# Correlation coefficient value for Salary Hike and Churn_out_Date
cr<- sh.cr$Churn_out_rate
sh <- sh.cr$Salary_hike
cor(cr,sh)

## [1] -0.9117216

# If |r| is greater than  0.85 then Co-relation is Strong(Correlation Co-efficient = -0.9117216). 
# This has a strong negative Correlation 

# Simple model without using any transformation
reg<-lm(cr~sh)
summary(reg)

## 
## Call:
## lm(formula = cr ~ sh)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.804 -3.059 -1.819  2.430  8.072 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 244.36491   27.35194   8.934 1.96e-05 ***
## sh           -0.10154    0.01618  -6.277 0.000239 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.469 on 8 degrees of freedom
## Multiple R-squared:  0.8312, Adjusted R-squared:  0.8101 
## F-statistic:  39.4 on 1 and 8 DF,  p-value: 0.0002386

# Probability value should be less than 0.05(1.96e-05)
# The multiple-R-Squared Value is 0.8312 which is greater than 0.8(In General)
# Adjusted R-Squared Value is 0.8101 
# The Probability Value for F-Statistic is 0.0002386(Overall Probability Model is also less than 0.05)
confint(reg,level = 0.95) # confidence interval

##                   2.5 %      97.5 %
## (Intercept) 181.2912317 307.4385905
## sh           -0.1388454  -0.0642399

# The above code will get you 2 equations 
# 1 to caliculate the lower range and other for upper range

# Function to Predict the above model 
predict(reg,interval="predict")

## Warning in predict.lm(reg, interval = "predict"): predictions on current data refer to _future_ responses

##         fit      lwr      upr
## 1  83.92753 72.38391 95.47115
## 2  81.89668 70.59327 93.20009
## 3  80.88125 69.68123 92.08127
## 4  77.83497 66.87456 88.79538
## 5  75.80412 64.94216 86.66607
## 6  72.75784 61.94828 83.56740
## 7  71.13316 60.30425 81.96206
## 8  68.69613 57.77694 79.61533
## 9  61.58815 50.00746 73.16884
## 10 54.48016 41.72742 67.23290

# predict(reg,type="prediction")
# Adjusted R-squared value for the above model is 0.8101 

# we may have to do transformation of variables for better R-squared value
# Applying transformations

# Logarthmic transformation
reg_log<-lm(cr~log(sh))  # Regression using logarthmic transformation
summary(reg_log)

## 
## Call:
## lm(formula = cr ~ log(sh))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.678 -2.851 -1.794  2.275  7.624 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1381.5      195.4   7.070 0.000105 ***
## log(sh)       -176.1       26.3  -6.697 0.000153 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.233 on 8 degrees of freedom
## Multiple R-squared:  0.8486, Adjusted R-squared:  0.8297 
## F-statistic: 44.85 on 1 and 8 DF,  p-value: 0.0001532

confint(reg_log,level=0.95)

##                 2.5 %    97.5 %
## (Intercept)  930.8584 1832.0540
## log(sh)     -236.7512 -115.4682

predict(reg_log,interval="predict")

## Warning in predict.lm(reg_log, interval = "predict"): predictions on current data refer to _future_ responses

##         fit      lwr      upr
## 1  84.37627 73.40258 95.34996
## 2  82.16102 71.43838 92.88366
## 3  81.06376 70.44736 91.68017
## 4  77.81241 67.43614 88.18869
## 5  75.67773 65.39568 85.95978
## 6  72.52344 62.28515 82.76172
## 7  70.86397 60.60253 81.12541
## 8  68.40372 58.04985 78.75760
## 9  61.41829 50.44392 72.39265
## 10 54.69939 42.69592 66.70286

# Multiple R-squared value for the above model is 0.8486
# Adjusted R-squared:  0.8297 

# we may have to do different transformation for a better R-squared value
# Applying different transformations

# Exponential model 
reg_exp<-lm(log(cr)~sh) # regression using Exponential model
summary(reg_exp)

## 
## Call:
## lm(formula = log(cr) ~ sh)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.04825 -0.03519 -0.01909  0.02942  0.08970 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.6383000  0.3175983  20.902 2.88e-08 ***
## sh          -0.0013963  0.0001878  -7.434 7.38e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0519 on 8 degrees of freedom
## Multiple R-squared:  0.8735, Adjusted R-squared:  0.8577 
## F-statistic: 55.26 on 1 and 8 DF,  p-value: 7.377e-05

confint(reg_exp,level=0.95)

##                    2.5 %        97.5 %
## (Intercept)  5.905917079  7.3706828388
## sh          -0.001829477 -0.0009631923

exp(predict(reg_exp,interval="predict"))

## Warning in predict.lm(reg_exp, interval = "predict"): predictions on current data refer to _future_ responses

##         fit      lwr      upr
## 1  84.10710 73.55634 96.17122
## 2  81.79076 71.73037 93.26215
## 3  80.65662 70.82070 91.85860
## 4  77.34770 68.10452 87.84537
## 5  75.21752 66.30465 85.32848
## 6  72.13174 63.62321 81.77813
## 7  70.53808 62.20357 79.98932
## 8  68.21338 60.09051 77.43427
## 9  61.86146 54.07801 70.76517
## 10 56.10101 48.37944 65.05499

# Multiple R-squared value - 0.8735
# Adjusted R SQuare Value - 0.8577 
# Higher the R-sqaured value - Better chances of getting good model 
# for Delivery Time and Sorting Time

# Quadratic model
sh.cr[,"sh_sq"] = sh*sh

# Quadratic model
quad_mod <- lm(cr~sh+I(sh^2),data=sh.cr)
summary(quad_mod)

## 
## Call:
## lm(formula = cr ~ sh + I(sh^2), data = sh.cr)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5523 -1.3280  0.3497  0.9029  2.8296 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.647e+03  2.281e+02   7.222 0.000174 ***
## sh          -1.737e+00  2.657e-01  -6.538 0.000322 ***
## I(sh^2)      4.754e-04  7.720e-05   6.158 0.000464 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.886 on 7 degrees of freedom
## Multiple R-squared:  0.9737, Adjusted R-squared:  0.9662 
## F-statistic: 129.6 on 2 and 7 DF,  p-value: 2.949e-06

confint(quad_mod,level=0.95)

##                     2.5 %        97.5 %
## (Intercept)  1.107738e+03  2.186285e+03
## sh          -2.365306e+00 -1.108872e+00
## I(sh^2)      2.928508e-04  6.579259e-04

predict(quad_mod,interval="predict")

## Warning in predict.lm(quad_mod, interval = "predict"): predictions on current data refer to _future_ responses

##         fit      lwr      upr
## 1  89.17035 83.78480 94.55590
## 2  84.66327 79.65803 89.66851
## 3  82.55234 77.66358 87.44111
## 4  76.79003 72.03031 81.54976
## 5  73.42388 68.63567 78.21210
## 6  69.08774 64.20255 73.97292
## 7  67.12501 62.19286 72.05716
## 8  64.63730 59.66195 69.61265
## 9  60.50952 55.48124 65.53780
## 10 61.04055 54.97443 67.10667

# Adjusted R-Squared = 0.9662
#Multiple R -Squared Value = 0.9737

# Quadratic model
qd_model <- lm(cr~sh+sh_sq,data=sh.cr)
summary(qd_model)

## 
## Call:
## lm(formula = cr ~ sh + sh_sq, data = sh.cr)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5523 -1.3280  0.3497  0.9029  2.8296 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.647e+03  2.281e+02   7.222 0.000174 ***
## sh          -1.737e+00  2.657e-01  -6.538 0.000322 ***
## sh_sq        4.754e-04  7.720e-05   6.158 0.000464 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.886 on 7 degrees of freedom
## Multiple R-squared:  0.9737, Adjusted R-squared:  0.9662 
## F-statistic: 129.6 on 2 and 7 DF,  p-value: 2.949e-06

confint(quad_mod,level=0.95)

##                     2.5 %        97.5 %
## (Intercept)  1.107738e+03  2.186285e+03
## sh          -2.365306e+00 -1.108872e+00
## I(sh^2)      2.928508e-04  6.579259e-04

predict(quad_mod,interval="predict")

## Warning in predict.lm(quad_mod, interval = "predict"): predictions on current data refer to _future_ responses

##         fit      lwr      upr
## 1  89.17035 83.78480 94.55590
## 2  84.66327 79.65803 89.66851
## 3  82.55234 77.66358 87.44111
## 4  76.79003 72.03031 81.54976
## 5  73.42388 68.63567 78.21210
## 6  69.08774 64.20255 73.97292
## 7  67.12501 62.19286 72.05716
## 8  64.63730 59.66195 69.61265
## 9  60.50952 55.48124 65.53780
## 10 61.04055 54.97443 67.10667

# Adjusted R-Squared = 0.9662 
#Multiple R -Squared Value = 0.9737

# Cubic model
poly_mod <- lm(cr~sh+I(sh^2)+I(sh^3),data=sh.cr)
summary(poly_mod) # 0.9893

## 
## Call:
## lm(formula = cr ~ sh + I(sh^2) + I(sh^3), data = sh.cr)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.06811 -0.49848  0.04253  0.76434  1.49050 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  1.320e+04  3.900e+03   3.384   0.0148 *
## sh          -2.194e+01  6.817e+00  -3.218   0.0182 *
## I(sh^2)      1.223e-02  3.966e-03   3.084   0.0216 *
## I(sh^3)     -2.276e-06  7.679e-07  -2.964   0.0251 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.298 on 6 degrees of freedom
## Multiple R-squared:  0.9893, Adjusted R-squared:  0.984 
## F-statistic: 185.4 on 3 and 6 DF,  p-value: 2.647e-06

confint(poly_mod,level=0.95)

##                     2.5 %        97.5 %
## (Intercept)  3.654531e+03  2.273912e+04
## sh          -3.861930e+01 -5.255724e+00
## I(sh^2)      2.525628e-03  2.193585e-02
## I(sh^3)     -4.154996e-06 -3.971350e-07

predict(poly_mod,interval="predict")

## Warning in predict.lm(poly_mod, interval = "predict"): predictions on current data refer to _future_ responses

##         fit      lwr      upr
## 1  90.86026 86.77985 94.94067
## 2  84.73315 81.16894 88.29736
## 3  82.06811 78.56441 85.57180
## 4  75.50317 71.95163 79.05471
## 5  72.18180 68.62172 75.74188
## 6  68.50950 64.99864 72.02037
## 7  67.08304 63.57116 70.59491
## 8  65.48442 61.87360 69.09525
## 9  62.88304 58.80179 66.96430
## 10 59.69351 55.23357 64.15346

# Adjusted R-Squared = 0.984
#Multiple R -Squared Value = 0.9893

model_R_Squared_values <- list(model=NULL,R_squared=NULL)
model_R_Squared_values[["model"]] <- c("reg","reg_log","reg_exp","quad_mod","poly_mod")
model_R_Squared_values[["R_squared"]] <- c(0.8101,0.8297,0.8577,0.9662,0.984)
Final <- cbind(model_R_Squared_values[["model"]],model_R_Squared_values[["R_squared"]])
View(model_R_Squared_values)
View(Final)

# Cubic  model gives the best Adjusted R-Squared value
predicted_Value <- predict(poly_mod)
predicted_Value

##        1        2        3        4        5        6        7        8 
## 90.86026 84.73315 82.06811 75.50317 72.18180 68.50950 67.08304 65.48442 
##        9       10 
## 62.88304 59.69351

Final <- cbind(Salary_Hike=sh.cr$Salary_hike,Churn_Rate = sh.cr$Churn_out_rate,Pred_Chr_rate=predicted_Value)

View(Final)

rmse<-sqrt(mean((predicted_Value-cr)^2))
rmse

## [1] 1.0052

plot(poly_mod)

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

hist(residuals(poly_mod)) # close to normal distribution