# Build a prediction model for Salary_hike

ye.sh <- read.csv(file.choose()) # choose the Salary_Data data set
View(ye.sh)

# 30 Observations of 2 variables

# Scatter Diagram (Plot x,y)
plot(ye.sh$YearsExperience,ye.sh$Salary)
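
# Optional: a labeled version of the scatter plot (axis labels, title, and point
# style are illustrative choices)
plot(ye.sh$YearsExperience, ye.sh$Salary,
     xlab = "Years of Experience", ylab = "Salary",
     main = "Salary vs Years of Experience", pch = 19)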

# Other Exploratory data analysis and Plots

boxplot(ye.sh)

hist(ye.sh$YearsExperience)

hist(ye.sh$Salary)

summary(ye.sh)
##  YearsExperience      Salary      
##  Min.   : 1.100   Min.   : 37731  
##  1st Qu.: 3.200   1st Qu.: 56721  
##  Median : 4.700   Median : 65237  
##  Mean   : 5.313   Mean   : 76003  
##  3rd Qu.: 7.700   3rd Qu.:100545  
##  Max.   :10.500   Max.   :122391
# Correlation coefficient value for Years of Experience and Employee Salary Hike
ye<- ye.sh$YearsExperience
sh <- ye.sh$Salary
cor(ye,sh)
## [1] 0.9782416
# If |r| is greater than 0.85, the correlation is considered strong (here r = 0.9782416),
# so Years of Experience and Salary have a strong positive correlation
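
# Optional check: cor.test() also reports a p-value and a 95% confidence interval for r
cor.test(ye, sh)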

# Simple model without using any transformation
reg<-lm(sh~ye)
summary(reg)
## 
## Call:
## lm(formula = sh ~ ye)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7958.0 -4088.5  -459.9  3372.6 11448.0 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  25792.2     2273.1   11.35 5.51e-12 ***
## ye            9450.0      378.8   24.95  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5788 on 28 degrees of freedom
## Multiple R-squared:  0.957,  Adjusted R-squared:  0.9554 
## F-statistic: 622.5 on 1 and 28 DF,  p-value: < 2.2e-16
# The coefficient p-values are well below 0.05 (intercept: 5.51e-12, slope: < 2e-16)
# Multiple R-squared is 0.957, comfortably above the common 0.8 rule of thumb
# Adjusted R-squared is 0.9554
# The F-statistic p-value (< 2.2e-16) shows the overall model is also significant at the 0.05 level
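
# Optional visual check of the simple linear fit (an illustrative sketch in base R)
plot(ye, sh, xlab = "Years of Experience", ylab = "Salary")
abline(reg, col = "blue")  # fitted line, approximately sh = 25792 + 9450 * ye
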
confint(reg,level = 0.95) # confidence interval
##                 2.5 %   97.5 %
## (Intercept) 21136.061 30448.34
## ye           8674.119 10225.81
# confint() returns the 95% confidence interval for each coefficient:
# a lower and an upper bound for the intercept and for the slope
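
# The same intervals can be reproduced from the coefficient table as
# estimate +/- t(0.975, df) * standard error (an illustrative sketch)
est <- coef(summary(reg))           # columns: Estimate, Std. Error, t value, Pr(>|t|)
tcrit <- qt(0.975, df = df.residual(reg))
cbind(lower = est[, "Estimate"] - tcrit * est[, "Std. Error"],
      upper = est[, "Estimate"] + tcrit * est[, "Std. Error"])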

# Prediction intervals for the fitted values of the model
predict(reg,interval="predict")
## Warning in predict.lm(reg, interval = "predict"): predictions on current data refer to _future_ responses
##          fit       lwr       upr
## 1   36187.16  23698.92  48675.40
## 2   38077.15  25628.63  50525.67
## 3   39967.14  27556.52  52377.76
## 4   44692.12  32368.22  57016.03
## 5   46582.12  34289.64  58874.59
## 6   53197.09  40999.70  65394.48
## 7   54142.09  41956.37  66327.80
## 8   56032.08  43868.25  68195.91
## 9   56032.08  43868.25  68195.91
## 10  60757.06  48639.42  72874.70
## 11  62647.05  50544.46  74749.65
## 12  63592.05  51496.24  75687.86
## 13  63592.05  51496.24  75687.86
## 14  64537.05  52447.52  76626.57
## 15  68317.03  56247.70  80386.36
## 16  72097.02  60039.93  84154.10
## 17  73987.01  61933.05  86040.96
## 18  75877.00  63824.18  87929.82
## 19  81546.98  69485.57  93608.39
## 20  82491.97  70427.39  94556.56
## 21  90051.94  77944.06 102159.83
## 22  92886.93  80754.66 105019.20
## 23 100446.90  88228.15 112665.65
## 24 103281.89  91022.76 115541.02
## 25 108006.87  95670.98 120342.77
## 26 110841.86  98454.30 123229.42
## 27 115566.84 103084.00 128049.68
## 28 116511.84 104008.59 129015.09
## 29 123126.81 110468.27 135785.35
## 30 125016.80 112309.98 137723.63
# Adjusted R-squared value for the above model is 0.9554 
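
# To predict for a new input (for example, 5 years of experience; the value is
# purely illustrative), pass it via newdata
predict(reg, newdata = data.frame(ye = 5), interval = "prediction")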

# we may still try transformations of the variables in search of a better fit
# Applying transformations

# Logarithmic transformation
reg_log <- lm(sh~log(ye))  # Regression using a logarithmic transformation of the predictor
summary(reg_log)
## 
## Call:
## lm(formula = sh ~ log(ye))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15392.6  -7523.0    559.7   6336.1  20629.8 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    14928       5156   2.895  0.00727 ** 
## log(ye)        40582       3172  12.792 3.25e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10660 on 28 degrees of freedom
## Multiple R-squared:  0.8539, Adjusted R-squared:  0.8487 
## F-statistic: 163.6 on 1 and 28 DF,  p-value: 3.25e-13
confint(reg_log,level=0.95)
##                 2.5 %   97.5 %
## (Intercept)  4365.921 25490.02
## log(ye)     34083.512 47080.46
predict(reg_log,interval="predict")
## Warning in predict.lm(reg_log, interval = "predict"): predictions on current data refer to _future_ responses
##          fit       lwr       upr
## 1   18795.85 -5225.823  42817.52
## 2   25575.24  1946.237  49204.23
## 3   31382.55  8054.979  54710.13
## 4   43057.26 20232.824  65881.70
## 5   46925.14 24235.859  69614.42
## 6   58136.05 35746.140  80525.96
## 7   59511.84 37149.017  81874.67
## 8   62130.94 39813.758  84448.13
## 9   62130.94 39813.758  84448.13
## 10  68022.72 45779.622  90265.82
## 11  70159.11 47933.039  92385.17
## 12  71186.55 48966.805  93406.30
## 13  71186.55 48966.805  93406.30
## 14  72188.63 49973.872  94403.38
## 15  75966.42 53760.064  98172.78
## 16  79422.30 57209.189 101635.40
## 17  81045.79 58824.757 103266.83
## 18  82606.83 60375.307 104838.35
## 19  86959.07 64683.513 109234.62
## 20  87641.13 65356.711 109925.56
## 21  92720.50 70353.368 115087.64
## 22  94472.51 72070.071 116874.96
## 23  98805.37 76300.817 121309.93
## 24 100317.92 77772.799 122863.04
## 25 102719.92 80105.200 125334.64
## 26 104095.71 81438.281 126753.14
## 27 106289.87 83560.068 129019.67
## 28 106714.81 83970.397 129459.23
## 29 109571.01 86723.331 132418.68
## 30 110351.45 87474.053 133228.86
# Multiple R-squared value for the above model is 0.8539
# Adjusted R-squared:  0.8487 
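
# Optional: overlay the log-model fit on the scatter plot to visualise the implied
# curvature (an illustrative sketch; ordering by ye keeps the line monotone)
plot(ye, sh, xlab = "Years of Experience", ylab = "Salary")
ord <- order(ye)
lines(ye[ord], fitted(reg_log)[ord], col = "red")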

# the log transformation lowered the R-squared, so we try a different transformation
# Applying a different transformation

# Exponential model 
reg_exp <- lm(log(sh)~ye) # regression with a log-transformed response (exponential model)
summary(reg_exp)
## 
## Call:
## lm(formula = log(sh) ~ ye)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.18949 -0.06946 -0.01068  0.06932  0.19029 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.507402   0.038443  273.33   <2e-16 ***
## ye           0.125453   0.006406   19.59   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09789 on 28 degrees of freedom
## Multiple R-squared:  0.932,  Adjusted R-squared:  0.9295 
## F-statistic: 383.6 on 1 and 28 DF,  p-value: < 2.2e-16
confint(reg_exp,level=0.95)
##                  2.5 %     97.5 %
## (Intercept) 10.4286558 10.5861480
## ye           0.1123316  0.1385742
exp(predict(reg_exp,interval="predict"))
## Warning in predict.lm(reg_exp, interval = "predict"): predictions on current data refer to _future_ responses
##          fit       lwr       upr
## 1   41998.96  34002.70  51875.66
## 2   43066.07  34890.07  53158.00
## 3   44160.29  35799.49  54473.71
## 4   47019.03  38172.93  57915.09
## 5   48213.69  39163.64  59355.04
## 6   52639.14  42827.23  64699.01
## 7   53303.68  43376.45  65502.87
## 8   54658.01  44495.02  67142.31
## 9   54658.01  44495.02  67142.31
## 10  58196.33  47412.46  71432.97
## 11  59674.98  48629.49  73229.31
## 12  60428.34  49249.05  74145.27
## 13  60428.34  49249.05  74145.27
## 14  61191.20  49876.09  75073.32
## 15  64340.20  52460.71  78909.75
## 16  67651.25  55171.84  82953.39
## 17  69370.13  56576.64  85056.57
## 18  71132.68  58015.25  87216.02
## 19  76693.63  62541.63  94047.97
## 20  77661.84  63327.77  95240.38
## 21  85860.70  69962.12 105372.17
## 22  89153.73  72615.43 109458.64
## 23  98565.81  80164.21 121191.48
## 24 102346.11  83181.93 125925.51
## 25 108971.54  88451.84 134251.55
## 26 113150.93  91764.03 139522.36
## 27 120475.82  97547.11 148793.98
## 28 121996.74  98744.48 150724.43
## 29 133194.62 107525.31 164991.92
## 30 136578.83 110167.31 169322.25
# Multiple R-squared value - 0.932
# Adjusted R-squared value - 0.9295
# A higher R-squared generally indicates a better-fitting model
# for Salary hike versus Years of Experience
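
# Because reg_exp models log(sh), its R-squared is not directly comparable with the
# untransformed models; an optional check is the error on the original salary scale
# (simple back-transformation with exp(), ignoring any bias correction)
rmse_exp <- sqrt(mean((exp(predict(reg_exp)) - sh)^2))
rmse_exp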

# Quadratic model: add a squared term for Years of Experience
ye.sh[,"ye_sq"] <- ye*ye

# Quadratic model using I(ye^2) in the formula
quad_mod <- lm(sh~ye+I(ye^2),data=ye.sh)
summary(quad_mod)
## 
## Call:
## lm(formula = sh ~ ye + I(ye^2), data = ye.sh)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -7835  -4026   -493   3309  11579 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 26214.93    4554.67   5.756 4.04e-06 ***
## ye           9259.28    1811.01   5.113 2.25e-05 ***
## I(ye^2)        16.39     152.12   0.108    0.915    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5893 on 27 degrees of freedom
## Multiple R-squared:  0.957,  Adjusted R-squared:  0.9538 
## F-statistic: 300.3 on 2 and 27 DF,  p-value: < 2.2e-16
confint(quad_mod,level=0.95)
##                  2.5 %     97.5 %
## (Intercept) 16869.5127 35560.3527
## ye           5543.4050 12975.1628
## I(ye^2)      -295.7344   328.5195
predict(quad_mod,interval="predict")
## Warning in predict.lm(quad_mod, interval = "predict"): predictions on current data refer to _future_ responses
##          fit       lwr       upr
## 1   36419.98  22934.59  49905.37
## 2   38279.71  25011.39  51548.02
## 3   40140.74  27059.48  53222.00
## 4   44799.07  32066.85  57531.29
## 5   46664.70  34030.20  59299.19
## 6   53204.72  40764.58  65644.86
## 7   54140.32  41712.88  66567.76
## 8   56012.50  43601.83  68423.18
## 9   56012.50  43601.83  68423.18
## 10  60698.70  48290.87  73106.53
## 11  62575.47  50157.82  74993.12
## 12  63514.35  51090.24  75938.46
## 13  63514.35  51090.24  75938.46
## 14  64453.56  52022.20  76884.91
## 15  68213.66  55748.59  80678.73
## 16  71979.01  59479.19  84478.83
## 17  73863.65  61348.25  86379.05
## 18  75749.60  63220.68  88278.53
## 19  81415.33  68861.91  93968.76
## 20  82360.77  69805.84  94915.70
## 21  89936.06  77392.43 102479.68
## 22  92782.20  80249.63 105314.77
## 23 100386.34  87872.00 112900.67
## 24 103243.30  90719.45 115767.14
## 25 108011.46  95430.60 120592.31
## 26 110876.29  98226.05 123526.52
## 27 115657.56 102810.50 128504.62
## 28 116614.80 103713.75 129515.84
## 29 123324.64 109876.71 136772.58
## 30 125244.69 111578.68 138910.71
# Adjusted R-squared = 0.9538
# Multiple R-squared = 0.957
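
# Optional: since the simple model is nested in the quadratic model, an F-test can
# check whether the squared term improves the fit
anova(reg, quad_mod)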

# Quadratic model refit using the precomputed ye_sq column (equivalent to quad_mod)
qd_model <- lm(sh~ye+ye_sq,data=ye.sh)
summary(qd_model)
## 
## Call:
## lm(formula = sh ~ ye + ye_sq, data = ye.sh)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -7835  -4026   -493   3309  11579 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 26214.93    4554.67   5.756 4.04e-06 ***
## ye           9259.28    1811.01   5.113 2.25e-05 ***
## ye_sq          16.39     152.12   0.108    0.915    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5893 on 27 degrees of freedom
## Multiple R-squared:  0.957,  Adjusted R-squared:  0.9538 
## F-statistic: 300.3 on 2 and 27 DF,  p-value: < 2.2e-16
confint(qd_model,level=0.95)
##                  2.5 %     97.5 %
## (Intercept) 16869.5127 35560.3527
## ye           5543.4050 12975.1628
## ye_sq        -295.7344   328.5195
predict(qd_model,interval="predict")
## Warning in predict.lm(qd_model, interval = "predict"): predictions on current data refer to _future_ responses
##          fit       lwr       upr
## 1   36419.98  22934.59  49905.37
## 2   38279.71  25011.39  51548.02
## 3   40140.74  27059.48  53222.00
## 4   44799.07  32066.85  57531.29
## 5   46664.70  34030.20  59299.19
## 6   53204.72  40764.58  65644.86
## 7   54140.32  41712.88  66567.76
## 8   56012.50  43601.83  68423.18
## 9   56012.50  43601.83  68423.18
## 10  60698.70  48290.87  73106.53
## 11  62575.47  50157.82  74993.12
## 12  63514.35  51090.24  75938.46
## 13  63514.35  51090.24  75938.46
## 14  64453.56  52022.20  76884.91
## 15  68213.66  55748.59  80678.73
## 16  71979.01  59479.19  84478.83
## 17  73863.65  61348.25  86379.05
## 18  75749.60  63220.68  88278.53
## 19  81415.33  68861.91  93968.76
## 20  82360.77  69805.84  94915.70
## 21  89936.06  77392.43 102479.68
## 22  92782.20  80249.63 105314.77
## 23 100386.34  87872.00 112900.67
## 24 103243.30  90719.45 115767.14
## 25 108011.46  95430.60 120592.31
## 26 110876.29  98226.05 123526.52
## 27 115657.56 102810.50 128504.62
## 28 116614.80 103713.75 129515.84
## 29 123324.64 109876.71 136772.58
## 30 125244.69 111578.68 138910.71
# Adjusted R-squared = 0.9538
# Multiple R-squared = 0.957
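
# quad_mod (using I(ye^2)) and qd_model (using the precomputed ye_sq column) describe
# the same quadratic fit; a quick sanity check
all.equal(unname(fitted(quad_mod)), unname(fitted(qd_model)))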

# Cubic model
poly_mod <- lm(sh~ye+I(ye^2)+I(ye^3),data=ye.sh)
summary(poly_mod) # 0.9636
## 
## Call:
## lm(formula = sh ~ ye + I(ye^2) + I(ye^3), data = ye.sh)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -7468  -4286  -1100   2639  10412 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 38863.07    7214.75   5.387 1.21e-05 ***
## ye           -718.71    4892.11  -0.147   0.8843    
## I(ye^2)      2099.35     968.36   2.168   0.0395 *  
## I(ye^3)      -122.92      56.52  -2.175   0.0389 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5524 on 26 degrees of freedom
## Multiple R-squared:  0.9636, Adjusted R-squared:  0.9594 
## F-statistic: 229.4 on 3 and 26 DF,  p-value: < 2.2e-16
confint(poly_mod,level=0.95)
##                   2.5 %      97.5 %
## (Intercept)  24032.9470 53693.19673
## ye          -10774.5874  9337.17053
## I(ye^2)        108.8691  4089.83482
## I(ye^3)       -239.0934    -6.73744
predict(poly_mod,interval="predict")
## Warning in predict.lm(poly_mod, interval = "predict"): predictions on current data refer to _future_ responses
##          fit       lwr       upr
## 1   40449.11  27225.47  53672.74
## 2   41206.61  28443.63  53969.59
## 3   42093.71  29671.89  54515.53
## 4   44839.74  32883.55  56795.93
## 5   46133.97  34259.01  58008.93
## 6   51436.58  39635.79  63237.38
## 7   52282.40  40481.08  64083.71
## 8   54032.88  42229.44  65836.31
## 9   54032.88  42229.44  65836.31
## 10  58717.94  46916.97  70518.91
## 11  60700.03  48905.33  72494.74
## 12  61711.28  49920.66  73501.90
## 13  61711.28  49920.66  73501.90
## 14  62735.02  50948.94  74521.10
## 15  66940.09  55173.09  78707.10
## 16  71285.97  59529.80  83042.13
## 17  73496.95  61739.31  85254.59
## 18  75725.44  63960.18  87490.69
## 19  82456.89  70627.62  94286.16
## 20  83577.76  71732.13  95423.39
## 21  92401.35  80394.05 104408.65
## 22  95595.79  83530.42 107661.17
## 23 103603.74  91465.11 115742.37
## 24 106358.46  94235.02 118481.90
## 25 110570.09  98511.11 122629.07
## 26 112836.87 100814.05 124859.68
## 27 116117.25 104045.46 128189.04
## 28 116692.05 104577.17 128806.94
## 29 119867.63 106823.53 132911.74
## 30 120480.23 106880.01 134080.44
# Adjusted R-squared = 0.9594
# Multiple R-squared = 0.9636
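
# A high R-squared on the training data can also come from overfitting; an optional
# extra check is a leave-one-out cross-validated RMSE, sketched here with the
# closed-form PRESS residuals for lm
loocv_rmse <- function(m) sqrt(mean((residuals(m) / (1 - hatvalues(m)))^2))
sapply(list(reg = reg, quad = quad_mod, cubic = poly_mod), loocv_rmse)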

# Collect the adjusted R-squared values of all fitted models for comparison
model_R_Squared_values <- list(model=NULL,R_squared=NULL)
model_R_Squared_values[["model"]] <- c("reg","reg_log","reg_exp","quad_mod","poly_mod")
model_R_Squared_values[["R_squared"]] <- c(0.9554,0.8487,0.9295,0.9538,0.9594) # adjusted R-squared values
Final <- cbind(model_R_Squared_values[["model"]],model_R_Squared_values[["R_squared"]])
View(model_R_Squared_values)
View(Final)

# The cubic model gives the best adjusted R-squared value
predicted_Value <- predict(poly_mod)
predicted_Value
##         1         2         3         4         5         6         7 
##  40449.11  41206.61  42093.71  44839.74  46133.97  51436.58  52282.40 
##         8         9        10        11        12        13        14 
##  54032.88  54032.88  58717.94  60700.03  61711.28  61711.28  62735.02 
##        15        16        17        18        19        20        21 
##  66940.09  71285.97  73496.95  75725.44  82456.89  83577.76  92401.35 
##        22        23        24        25        26        27        28 
##  95595.79 103603.74 106358.46 110570.09 112836.87 116117.25 116692.05 
##        29        30 
## 119867.63 120480.23
Final <- cbind(YearsofExp=ye.sh$YearsExperience,Sal_Hike = ye.sh$Salary,Pred_sal_hike=predicted_Value) # note: overwrites the earlier comparison table

View(Final)

rmse<-sqrt(mean((predicted_Value-sh)^2))
rmse
## [1] 5142.642
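
# Optional: training RMSE of the other models on the original salary scale, for
# comparison (the exponential model is back-transformed with exp(), ignoring bias correction)
sqrt(mean((predict(reg) - sh)^2))           # simple linear
sqrt(mean((predict(quad_mod) - sh)^2))      # quadratic
sqrt(mean((exp(predict(reg_exp)) - sh)^2))  # exponential, original scale
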
plot(poly_mod)

hist(residuals(poly_mod)) # close to normal distribution
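
# Optional additional normality checks on the residuals (base R)
qqnorm(residuals(poly_mod))
qqline(residuals(poly_mod))
shapiro.test(residuals(poly_mod))  # formal test of normality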