# Build a prediction model for Salary_hike
ye.sh <- read.csv(file.choose()) # choose the Salary_Data data set
View(ye.sh)
# 30 Observations of 2 variables
# Scatter Diagram (Plot x,y)
plot(ye.sh$YearsExperience,ye.sh$Salary)

# Other Exploratory data analysis and Plots
boxplot(ye.sh)

hist(ye.sh$YearsExperience)

hist(ye.sh$Salary)

summary(ye.sh)
## YearsExperience Salary
## Min. : 1.100 Min. : 37731
## 1st Qu.: 3.200 1st Qu.: 56721
## Median : 4.700 Median : 65237
## Mean : 5.313 Mean : 76003
## 3rd Qu.: 7.700 3rd Qu.:100545
## Max. :10.500 Max. :122391
# Correlation coefficient value for Years of Experience and Employee Salary Hike
ye<- ye.sh$YearsExperience
sh <- ye.sh$Salary
cor(ye,sh)
## [1] 0.9782416
# If |r| is greater than 0.85 then Co-relation is Strong(Correlation Co-efficient = 0.9782416).
# This has a strong Positive Correlation
# Simple model without using any transformation
reg<-lm(sh~ye)
summary(reg)
##
## Call:
## lm(formula = sh ~ ye)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7958.0 -4088.5 -459.9 3372.6 11448.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25792.2 2273.1 11.35 5.51e-12 ***
## ye 9450.0 378.8 24.95 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5788 on 28 degrees of freedom
## Multiple R-squared: 0.957, Adjusted R-squared: 0.9554
## F-statistic: 622.5 on 1 and 28 DF, p-value: < 2.2e-16
# Probability value should be less than 0.05(5.51e-12)
# The multiple-R-Squared Value is 0.957 which is greater than 0.8(In General)
# Adjusted R-Squared Value is 0.9554
# The Probability Value for F-Statistic is 2.2e-16(Overall Probability Model is also less than 0.05)
confint(reg,level = 0.95) # confidence interval
## 2.5 % 97.5 %
## (Intercept) 21136.061 30448.34
## ye 8674.119 10225.81
# The above code will get you 2 equations
# 1 to caliculate the lower range and other for upper range
# Function to Predict the above model
predict(reg,interval="predict")
## Warning in predict.lm(reg, interval = "predict"): predictions on current data refer to _future_ responses
## fit lwr upr
## 1 36187.16 23698.92 48675.40
## 2 38077.15 25628.63 50525.67
## 3 39967.14 27556.52 52377.76
## 4 44692.12 32368.22 57016.03
## 5 46582.12 34289.64 58874.59
## 6 53197.09 40999.70 65394.48
## 7 54142.09 41956.37 66327.80
## 8 56032.08 43868.25 68195.91
## 9 56032.08 43868.25 68195.91
## 10 60757.06 48639.42 72874.70
## 11 62647.05 50544.46 74749.65
## 12 63592.05 51496.24 75687.86
## 13 63592.05 51496.24 75687.86
## 14 64537.05 52447.52 76626.57
## 15 68317.03 56247.70 80386.36
## 16 72097.02 60039.93 84154.10
## 17 73987.01 61933.05 86040.96
## 18 75877.00 63824.18 87929.82
## 19 81546.98 69485.57 93608.39
## 20 82491.97 70427.39 94556.56
## 21 90051.94 77944.06 102159.83
## 22 92886.93 80754.66 105019.20
## 23 100446.90 88228.15 112665.65
## 24 103281.89 91022.76 115541.02
## 25 108006.87 95670.98 120342.77
## 26 110841.86 98454.30 123229.42
## 27 115566.84 103084.00 128049.68
## 28 116511.84 104008.59 129015.09
## 29 123126.81 110468.27 135785.35
## 30 125016.80 112309.98 137723.63
# predict(reg,type="prediction")
# Adjusted R-squared value for the above model is 0.9554
# we may have to do transformation of variables for better R-squared value
# Applying transformations
# Logarthmic transformation
reg_log<-lm(sh~log(ye)) # Regression using logarthmic transformation
summary(reg_log)
##
## Call:
## lm(formula = sh ~ log(ye))
##
## Residuals:
## Min 1Q Median 3Q Max
## -15392.6 -7523.0 559.7 6336.1 20629.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14928 5156 2.895 0.00727 **
## log(ye) 40582 3172 12.792 3.25e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10660 on 28 degrees of freedom
## Multiple R-squared: 0.8539, Adjusted R-squared: 0.8487
## F-statistic: 163.6 on 1 and 28 DF, p-value: 3.25e-13
confint(reg_log,level=0.95)
## 2.5 % 97.5 %
## (Intercept) 4365.921 25490.02
## log(ye) 34083.512 47080.46
predict(reg_log,interval="predict")
## Warning in predict.lm(reg_log, interval = "predict"): predictions on current data refer to _future_ responses
## fit lwr upr
## 1 18795.85 -5225.823 42817.52
## 2 25575.24 1946.237 49204.23
## 3 31382.55 8054.979 54710.13
## 4 43057.26 20232.824 65881.70
## 5 46925.14 24235.859 69614.42
## 6 58136.05 35746.140 80525.96
## 7 59511.84 37149.017 81874.67
## 8 62130.94 39813.758 84448.13
## 9 62130.94 39813.758 84448.13
## 10 68022.72 45779.622 90265.82
## 11 70159.11 47933.039 92385.17
## 12 71186.55 48966.805 93406.30
## 13 71186.55 48966.805 93406.30
## 14 72188.63 49973.872 94403.38
## 15 75966.42 53760.064 98172.78
## 16 79422.30 57209.189 101635.40
## 17 81045.79 58824.757 103266.83
## 18 82606.83 60375.307 104838.35
## 19 86959.07 64683.513 109234.62
## 20 87641.13 65356.711 109925.56
## 21 92720.50 70353.368 115087.64
## 22 94472.51 72070.071 116874.96
## 23 98805.37 76300.817 121309.93
## 24 100317.92 77772.799 122863.04
## 25 102719.92 80105.200 125334.64
## 26 104095.71 81438.281 126753.14
## 27 106289.87 83560.068 129019.67
## 28 106714.81 83970.397 129459.23
## 29 109571.01 86723.331 132418.68
## 30 110351.45 87474.053 133228.86
# Multiple R-squared value for the above model is 0.8539
# Adjusted R-squared: 0.8487
# we may have to do different transformation for a better R-squared value
# Applying different transformations
# Exponential model
reg_exp<-lm(log(sh)~ye) # regression using Exponential model
summary(reg_exp)
##
## Call:
## lm(formula = log(sh) ~ ye)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.18949 -0.06946 -0.01068 0.06932 0.19029
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.507402 0.038443 273.33 <2e-16 ***
## ye 0.125453 0.006406 19.59 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09789 on 28 degrees of freedom
## Multiple R-squared: 0.932, Adjusted R-squared: 0.9295
## F-statistic: 383.6 on 1 and 28 DF, p-value: < 2.2e-16
confint(reg_exp,level=0.95)
## 2.5 % 97.5 %
## (Intercept) 10.4286558 10.5861480
## ye 0.1123316 0.1385742
exp(predict(reg_exp,interval="predict"))
## Warning in predict.lm(reg_exp, interval = "predict"): predictions on current data refer to _future_ responses
## fit lwr upr
## 1 41998.96 34002.70 51875.66
## 2 43066.07 34890.07 53158.00
## 3 44160.29 35799.49 54473.71
## 4 47019.03 38172.93 57915.09
## 5 48213.69 39163.64 59355.04
## 6 52639.14 42827.23 64699.01
## 7 53303.68 43376.45 65502.87
## 8 54658.01 44495.02 67142.31
## 9 54658.01 44495.02 67142.31
## 10 58196.33 47412.46 71432.97
## 11 59674.98 48629.49 73229.31
## 12 60428.34 49249.05 74145.27
## 13 60428.34 49249.05 74145.27
## 14 61191.20 49876.09 75073.32
## 15 64340.20 52460.71 78909.75
## 16 67651.25 55171.84 82953.39
## 17 69370.13 56576.64 85056.57
## 18 71132.68 58015.25 87216.02
## 19 76693.63 62541.63 94047.97
## 20 77661.84 63327.77 95240.38
## 21 85860.70 69962.12 105372.17
## 22 89153.73 72615.43 109458.64
## 23 98565.81 80164.21 121191.48
## 24 102346.11 83181.93 125925.51
## 25 108971.54 88451.84 134251.55
## 26 113150.93 91764.03 139522.36
## 27 120475.82 97547.11 148793.98
## 28 121996.74 98744.48 150724.43
## 29 133194.62 107525.31 164991.92
## 30 136578.83 110167.31 169322.25
# Multiple R-squared value - 0.932
# Adjusted R SQuare Value - 0.9295
# Higher the R-sqaured value - Better chances of getting good model
# for Salary hike and Years of Experience
# Quadratic model
ye.sh[,"ye_sq"] = ye*ye
# Quadratic model
quad_mod <- lm(sh~ye+I(ye^2),data=ye.sh)
summary(quad_mod)
##
## Call:
## lm(formula = sh ~ ye + I(ye^2), data = ye.sh)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7835 -4026 -493 3309 11579
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26214.93 4554.67 5.756 4.04e-06 ***
## ye 9259.28 1811.01 5.113 2.25e-05 ***
## I(ye^2) 16.39 152.12 0.108 0.915
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5893 on 27 degrees of freedom
## Multiple R-squared: 0.957, Adjusted R-squared: 0.9538
## F-statistic: 300.3 on 2 and 27 DF, p-value: < 2.2e-16
confint(quad_mod,level=0.95)
## 2.5 % 97.5 %
## (Intercept) 16869.5127 35560.3527
## ye 5543.4050 12975.1628
## I(ye^2) -295.7344 328.5195
predict(quad_mod,interval="predict")
## Warning in predict.lm(quad_mod, interval = "predict"): predictions on current data refer to _future_ responses
## fit lwr upr
## 1 36419.98 22934.59 49905.37
## 2 38279.71 25011.39 51548.02
## 3 40140.74 27059.48 53222.00
## 4 44799.07 32066.85 57531.29
## 5 46664.70 34030.20 59299.19
## 6 53204.72 40764.58 65644.86
## 7 54140.32 41712.88 66567.76
## 8 56012.50 43601.83 68423.18
## 9 56012.50 43601.83 68423.18
## 10 60698.70 48290.87 73106.53
## 11 62575.47 50157.82 74993.12
## 12 63514.35 51090.24 75938.46
## 13 63514.35 51090.24 75938.46
## 14 64453.56 52022.20 76884.91
## 15 68213.66 55748.59 80678.73
## 16 71979.01 59479.19 84478.83
## 17 73863.65 61348.25 86379.05
## 18 75749.60 63220.68 88278.53
## 19 81415.33 68861.91 93968.76
## 20 82360.77 69805.84 94915.70
## 21 89936.06 77392.43 102479.68
## 22 92782.20 80249.63 105314.77
## 23 100386.34 87872.00 112900.67
## 24 103243.30 90719.45 115767.14
## 25 108011.46 95430.60 120592.31
## 26 110876.29 98226.05 123526.52
## 27 115657.56 102810.50 128504.62
## 28 116614.80 103713.75 129515.84
## 29 123324.64 109876.71 136772.58
## 30 125244.69 111578.68 138910.71
# Adjusted R-Squared = 0.9538
#Multiple R -Squared Value = 0.957
# Quadratic model
qd_model <- lm(sh~ye+ye_sq,data=ye.sh)
summary(qd_model)
##
## Call:
## lm(formula = sh ~ ye + ye_sq, data = ye.sh)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7835 -4026 -493 3309 11579
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26214.93 4554.67 5.756 4.04e-06 ***
## ye 9259.28 1811.01 5.113 2.25e-05 ***
## ye_sq 16.39 152.12 0.108 0.915
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5893 on 27 degrees of freedom
## Multiple R-squared: 0.957, Adjusted R-squared: 0.9538
## F-statistic: 300.3 on 2 and 27 DF, p-value: < 2.2e-16
confint(quad_mod,level=0.95)
## 2.5 % 97.5 %
## (Intercept) 16869.5127 35560.3527
## ye 5543.4050 12975.1628
## I(ye^2) -295.7344 328.5195
predict(quad_mod,interval="predict")
## Warning in predict.lm(quad_mod, interval = "predict"): predictions on current data refer to _future_ responses
## fit lwr upr
## 1 36419.98 22934.59 49905.37
## 2 38279.71 25011.39 51548.02
## 3 40140.74 27059.48 53222.00
## 4 44799.07 32066.85 57531.29
## 5 46664.70 34030.20 59299.19
## 6 53204.72 40764.58 65644.86
## 7 54140.32 41712.88 66567.76
## 8 56012.50 43601.83 68423.18
## 9 56012.50 43601.83 68423.18
## 10 60698.70 48290.87 73106.53
## 11 62575.47 50157.82 74993.12
## 12 63514.35 51090.24 75938.46
## 13 63514.35 51090.24 75938.46
## 14 64453.56 52022.20 76884.91
## 15 68213.66 55748.59 80678.73
## 16 71979.01 59479.19 84478.83
## 17 73863.65 61348.25 86379.05
## 18 75749.60 63220.68 88278.53
## 19 81415.33 68861.91 93968.76
## 20 82360.77 69805.84 94915.70
## 21 89936.06 77392.43 102479.68
## 22 92782.20 80249.63 105314.77
## 23 100386.34 87872.00 112900.67
## 24 103243.30 90719.45 115767.14
## 25 108011.46 95430.60 120592.31
## 26 110876.29 98226.05 123526.52
## 27 115657.56 102810.50 128504.62
## 28 116614.80 103713.75 129515.84
## 29 123324.64 109876.71 136772.58
## 30 125244.69 111578.68 138910.71
# Adjusted R-Squared = 0.9538
#Multiple R -Squared Value = 0.957
# Cubic model
poly_mod <- lm(sh~ye+I(ye^2)+I(ye^3),data=ye.sh)
summary(poly_mod) # 0.9636
##
## Call:
## lm(formula = sh ~ ye + I(ye^2) + I(ye^3), data = ye.sh)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7468 -4286 -1100 2639 10412
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38863.07 7214.75 5.387 1.21e-05 ***
## ye -718.71 4892.11 -0.147 0.8843
## I(ye^2) 2099.35 968.36 2.168 0.0395 *
## I(ye^3) -122.92 56.52 -2.175 0.0389 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5524 on 26 degrees of freedom
## Multiple R-squared: 0.9636, Adjusted R-squared: 0.9594
## F-statistic: 229.4 on 3 and 26 DF, p-value: < 2.2e-16
confint(poly_mod,level=0.95)
## 2.5 % 97.5 %
## (Intercept) 24032.9470 53693.19673
## ye -10774.5874 9337.17053
## I(ye^2) 108.8691 4089.83482
## I(ye^3) -239.0934 -6.73744
predict(poly_mod,interval="predict")
## Warning in predict.lm(poly_mod, interval = "predict"): predictions on current data refer to _future_ responses
## fit lwr upr
## 1 40449.11 27225.47 53672.74
## 2 41206.61 28443.63 53969.59
## 3 42093.71 29671.89 54515.53
## 4 44839.74 32883.55 56795.93
## 5 46133.97 34259.01 58008.93
## 6 51436.58 39635.79 63237.38
## 7 52282.40 40481.08 64083.71
## 8 54032.88 42229.44 65836.31
## 9 54032.88 42229.44 65836.31
## 10 58717.94 46916.97 70518.91
## 11 60700.03 48905.33 72494.74
## 12 61711.28 49920.66 73501.90
## 13 61711.28 49920.66 73501.90
## 14 62735.02 50948.94 74521.10
## 15 66940.09 55173.09 78707.10
## 16 71285.97 59529.80 83042.13
## 17 73496.95 61739.31 85254.59
## 18 75725.44 63960.18 87490.69
## 19 82456.89 70627.62 94286.16
## 20 83577.76 71732.13 95423.39
## 21 92401.35 80394.05 104408.65
## 22 95595.79 83530.42 107661.17
## 23 103603.74 91465.11 115742.37
## 24 106358.46 94235.02 118481.90
## 25 110570.09 98511.11 122629.07
## 26 112836.87 100814.05 124859.68
## 27 116117.25 104045.46 128189.04
## 28 116692.05 104577.17 128806.94
## 29 119867.63 106823.53 132911.74
## 30 120480.23 106880.01 134080.44
# Adjusted R-Squared = 0.9594
#Multiple R -Squared Value = 0.9636
model_R_Squared_values <- list(model=NULL,R_squared=NULL)
model_R_Squared_values[["model"]] <- c("reg","reg_log","reg_exp","quad_mod","poly_mod")
model_R_Squared_values[["R_squared"]] <- c(0.9554,0.8487,0.9295,0.9538,0.9594)
Final <- cbind(model_R_Squared_values[["model"]],model_R_Squared_values[["R_squared"]])
View(model_R_Squared_values)
View(Final)
# Cubic model gives the best Adjusted R-Squared value
predicted_Value <- predict(poly_mod)
predicted_Value
## 1 2 3 4 5 6 7
## 40449.11 41206.61 42093.71 44839.74 46133.97 51436.58 52282.40
## 8 9 10 11 12 13 14
## 54032.88 54032.88 58717.94 60700.03 61711.28 61711.28 62735.02
## 15 16 17 18 19 20 21
## 66940.09 71285.97 73496.95 75725.44 82456.89 83577.76 92401.35
## 22 23 24 25 26 27 28
## 95595.79 103603.74 106358.46 110570.09 112836.87 116117.25 116692.05
## 29 30
## 119867.63 120480.23
Final <- cbind(YearsofExp=ye.sh$YearsExperience,Sal_Hike = ye.sh$Salary,Pred_sal_hike=predicted_Value)
View(Final)
rmse<-sqrt(mean((predicted_Value-sh)^2))
rmse
## [1] 5142.642
plot(poly_mod)




hist(residuals(poly_mod)) # close to normal distribution
