# Pick the Delivery_Time.csv file interactively and load it.
# dl.tm holds the data as read; dt.st is the binding the analysis below works on.
dt.st <- dl.tm <- read.csv(file = file.choose())
# windows()
View(dt.st)

# 21 Observations of 2 variables

# Scatter diagram: Sorting Time on the x axis, Delivery Time on the y axis
plot(x = dt.st$Sorting.Time, y = dt.st$Delivery.Time)

# Further exploratory data analysis and plots

# One box per column of the data frame
boxplot(x = dt.st)

# Distribution of each variable
hist(x = dt.st$Sorting.Time)

hist(x = dt.st$Delivery.Time)

# Min / quartiles / median / mean / max for every column
summary(object = dt.st)
##  Delivery.Time    Sorting.Time  
##  Min.   : 8.00   Min.   : 2.00  
##  1st Qu.:13.50   1st Qu.: 4.00  
##  Median :17.83   Median : 6.00  
##  Mean   :16.79   Mean   : 6.19  
##  3rd Qu.:19.75   3rd Qu.: 8.00  
##  Max.   :29.00   Max.   :10.00
# Correlation coefficient between Sorting Time (st) and Delivery Time (dt)
st <- dt.st[["Sorting.Time"]]
dt <- dt.st[["Delivery.Time"]]
cor(x = st, y = dt)
## [1] 0.8259973
# Rule of thumb used here: the correlation is called strong when |r| > 0.85.
# The coefficient is 0.8259973, so this is treated as a moderate correlation.

# Baseline model: simple linear regression without any transformation
reg <- lm(formula = dt ~ st)
summary(object = reg)
## 
## Call:
## lm(formula = dt ~ st)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1729 -2.0298 -0.0298  0.8741  6.6722 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.5827     1.7217   3.823  0.00115 ** 
## st            1.6490     0.2582   6.387 3.98e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.935 on 19 degrees of freedom
## Multiple R-squared:  0.6823, Adjusted R-squared:  0.6655 
## F-statistic:  40.8 on 1 and 19 DF,  p-value: 3.983e-06
# Coefficient p-values should be below 0.05: intercept 0.00115, slope (st) 3.98e-06 - both are
# The Multiple R-squared is 0.6823, which is below the 0.8 generally taken as a good fit
# The Adjusted R-squared is 0.6655
# The F-statistic p-value is 3.983e-06, so the overall model is also significant at the 0.05 level
# 95% confidence intervals for the coefficients of reg
confint(reg,level = 0.95) # confidence interval
##                2.5 %    97.5 %
## (Intercept) 2.979134 10.186334
## st          1.108673  2.189367
# Each column above can be read as one set of coefficients, giving 2 equations:
# one to calculate the lower range (2.5 %) and one for the upper range (97.5 %)

# Predictions from the above model for the observed rows;
# the recorded output has a fit column plus lwr/upr interval bounds
predict(reg,interval="predict")
## Warning in predict.lm(reg, interval = "predict"): predictions on current data refer to _future_ responses
##          fit       lwr      upr
## 1  23.072933 16.457161 29.68870
## 2  13.178814  6.780993 19.57663
## 3  16.476853 10.188630 22.76508
## 4  21.423913 14.955850 27.89198
## 5  23.072933 16.457161 29.68870
## 6  16.476853 10.188630 22.76508
## 7  18.125873 11.823294 24.42845
## 8  11.529794  5.010345 18.04924
## 9  23.072933 16.457161 29.68870
## 10 21.423913 14.955850 27.89198
## 11 19.774893 13.411938 26.13785
## 12 13.178814  6.780993 19.57663
## 13 18.125873 11.823294 24.42845
## 14 11.529794  5.010345 18.04924
## 15 11.529794  5.010345 18.04924
## 16 13.178814  6.780993 19.57663
## 17 16.476853 10.188630 22.76508
## 18 18.125873 11.823294 24.42845
## 19  9.880774  3.198090 16.56346
## 20 18.125873 11.823294 24.42845
## 21 14.827833  8.507631 21.14804
# predict(reg,type="prediction")
# Adjusted R-squared value for the above model is 0.6655 

# we may have to do transformation of variables for better R-squared value
# Applying transformations

# Logarithmic transformation: regress dt on log(st)
reg_log <- lm(formula = dt ~ log(st))
summary(object = reg_log)
## 
## Call:
## lm(formula = dt ~ log(st))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0829 -2.0133 -0.1965  0.9351  7.0171 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.160      2.455   0.472    0.642    
## log(st)        9.043      1.373   6.587 2.64e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.873 on 19 degrees of freedom
## Multiple R-squared:  0.6954, Adjusted R-squared:  0.6794 
## F-statistic: 43.39 on 1 and 19 DF,  p-value: 2.642e-06
# 95% confidence intervals for the intercept and the log(st) coefficient
confint(reg_log,level=0.95)
##                2.5 %    97.5 %
## (Intercept) -3.97778  6.297147
## log(st)      6.16977 11.917057
# Predictions for the observed rows with lwr/upr interval bounds
predict(reg_log,interval="predict")
## Warning in predict.lm(reg_log, interval = "predict"): predictions on current data refer to _future_ responses
##         fit        lwr      upr
## 1  21.98291 15.6099875 28.35584
## 2  13.69652  7.4628028 19.93023
## 3  17.36331 11.2049447 23.52167
## 4  21.03009 14.7287585 27.33143
## 5  21.98291 15.6099875 28.35584
## 6  17.36331 11.2049447 23.52167
## 7  18.75735 12.5700473 24.94466
## 8  11.09489  4.6786298 17.51115
## 9  21.98291 15.6099875 28.35584
## 10 21.03009 14.7287585 27.33143
## 11 19.96493 13.7271824 26.20268
## 12 13.69652  7.4628028 19.93023
## 13 18.75735 12.5700473 24.94466
## 14 11.09489  4.6786298 17.51115
## 15 11.09489  4.6786298 17.51115
## 16 13.69652  7.4628028 19.93023
## 17 17.36331 11.2049447 23.52167
## 18 18.75735 12.5700473 24.94466
## 19  7.42810  0.5911537 14.26505
## 20 18.75735 12.5700473 24.94466
## 21 15.71450  9.5493253 21.87967
# Multiple R-squared value for the above model is 0.6954
# Adjusted R-squared:  0.6794 

# we may have to do different transformation for a better R-squared value
# Applying different transformations

# Exponential model: log(dt) is modelled as a linear function of st
reg_exp <- lm(formula = log(dt) ~ st)
summary(object = reg_exp)
## 
## Call:
## lm(formula = log(dt) ~ st)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29209 -0.13364  0.02065  0.08421  0.41892 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.12137    0.10297  20.601 1.86e-14 ***
## st           0.10555    0.01544   6.836 1.59e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1755 on 19 degrees of freedom
## Multiple R-squared:  0.7109, Adjusted R-squared:  0.6957 
## F-statistic: 46.73 on 1 and 19 DF,  p-value: 1.593e-06
# 95% confidence intervals for the coefficients; reg_exp is fitted to log(dt)
# and no exp() is applied here, so these bounds are on the log scale
confint(reg_exp,level=0.95)
##                  2.5 %    97.5 %
## (Intercept) 1.90584807 2.3368956
## st          0.07323457 0.1378686
# exp() converts the fit/lwr/upr values from log(dt) back to dt units
exp(predict(reg_exp,interval="predict"))
## Warning in predict.lm(reg_exp, interval = "predict"): predictions on current data refer to _future_ responses
##         fit       lwr      upr
## 1  23.97203 16.138575 35.60775
## 2  12.72512  8.679275 18.65695
## 3  15.71603 10.789743 22.89153
## 4  21.57071 14.650800 31.75904
## 5  23.97203 16.138575 35.60775
## 6  15.71603 10.789743 22.89153
## 7  17.46560 11.980604 25.46174
## 8  11.45042  7.753250 16.91061
## 9  23.97203 16.138575 35.60775
## 10 21.57071 14.650800 31.75904
## 11 19.40993 13.266335 28.39859
## 12 12.72512  8.679275 18.65695
## 13 17.46560 11.980604 25.46174
## 14 11.45042  7.753250 16.91061
## 15 11.45042  7.753250 16.91061
## 16 12.72512  8.679275 18.65695
## 17 15.71603 10.789743 22.89153
## 18 17.46560 11.980604 25.46174
## 19 10.30341  6.908812 15.36592
## 20 17.46560 11.980604 25.46174
## 21 14.14173  9.690362 20.63787
# Multiple R-squared value - 0.7109
# Adjusted R-squared value - 0.6957
# The higher the R-squared value, the better the chance of getting a good model
# for Delivery Time and Sorting Time

# Quadratic model: keep the squared sorting time as an extra column of dt.st
dt.st[["st_sq"]] <- st * st

# Fit dt on st and its square; I() makes ^ act as arithmetic inside the formula
quad_mod <- lm(formula = dt ~ st + I(st^2), data = dt.st)
summary(object = quad_mod)
## 
## Call:
## lm(formula = dt ~ st + I(st^2), data = dt.st)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4324 -1.6951 -0.5365  0.9075  6.6676 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   3.5222     4.1597   0.847   0.4082  
## st            2.8130     1.4608   1.926   0.0701 .
## I(st^2)      -0.0932     0.1151  -0.810   0.4286  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.962 on 18 degrees of freedom
## Multiple R-squared:  0.6934, Adjusted R-squared:  0.6594 
## F-statistic: 20.36 on 2 and 18 DF,  p-value: 2.391e-05
# 95% confidence intervals for the quadratic fit; in the recorded output the
# intervals for st and I(st^2) both include 0
confint(quad_mod,level=0.95)
##                  2.5 %     97.5 %
## (Intercept) -5.2169258 12.2613936
## st          -0.2560669  5.8820703
## I(st^2)     -0.3349939  0.1485975
# Predictions for the observed rows with lwr/upr interval bounds
predict(quad_mod,interval="predict")
## Warning in predict.lm(quad_mod, interval = "predict"): predictions on current data refer to _future_ responses
##          fit       lwr      upr
## 1  22.332430 15.360768 29.30409
## 2  13.283069  6.796484 19.76965
## 3  17.045108 10.506802 23.58341
## 4  21.290194 14.728917 27.85147
## 5  22.332430 15.360768 29.30409
## 6  17.045108 10.506802 23.58341
## 7  18.646533 12.120732 25.17233
## 8  11.122455  4.434281 17.81063
## 9  22.332430 15.360768 29.30409
## 10 21.290194 14.728917 27.85147
## 11 20.061562 13.573170 26.54995
## 12 13.283069  6.796484 19.76965
## 13 18.646533 12.120732 25.17233
## 14 11.122455  4.434281 17.81063
## 15 11.122455  4.434281 17.81063
## 16 13.283069  6.796484 19.76965
## 17 17.045108 10.506802 23.58341
## 18 18.646533 12.120732 25.17233
## 19  8.775444  1.423580 16.12731
## 20 18.646533 12.120732 25.17233
## 21 15.257287  8.758744 21.75583
# Adjusted R-Squared = 0.6594
#Multiple R -Squared Value = 0.6934

# Quadratic model again, this time using the precomputed st_sq column of dt.st
# in place of an I(st^2) term
qd_model <- lm(formula = dt ~ st + st_sq, data = dt.st)
summary(object = qd_model)
## 
## Call:
## lm(formula = dt ~ st + st_sq, data = dt.st)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4324 -1.6951 -0.5365  0.9075  6.6676 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   3.5222     4.1597   0.847   0.4082  
## st            2.8130     1.4608   1.926   0.0701 .
## st_sq        -0.0932     0.1151  -0.810   0.4286  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.962 on 18 degrees of freedom
## Multiple R-squared:  0.6934, Adjusted R-squared:  0.6594 
## F-statistic: 20.36 on 2 and 18 DF,  p-value: 2.391e-05
# 95% confidence intervals for qd_model (the st_sq-column fit from this
# section). Inspect qd_model itself rather than quad_mod so the section reports
# on the model it just fitted; the squared term's row is therefore named st_sq.
confint(qd_model,level=0.95)
##                  2.5 %     97.5 %
## (Intercept) -5.2169258 12.2613936
## st          -0.2560669  5.8820703
## st_sq       -0.3349939  0.1485975
# Predictions for the observed rows with lwr/upr interval bounds; qd_model has
# the same coefficient estimates as quad_mod, so the values listed below apply.
predict(qd_model,interval="predict")
## Warning in predict.lm(qd_model, interval = "predict"): predictions on current data refer to _future_ responses
##          fit       lwr      upr
## 1  22.332430 15.360768 29.30409
## 2  13.283069  6.796484 19.76965
## 3  17.045108 10.506802 23.58341
## 4  21.290194 14.728917 27.85147
## 5  22.332430 15.360768 29.30409
## 6  17.045108 10.506802 23.58341
## 7  18.646533 12.120732 25.17233
## 8  11.122455  4.434281 17.81063
## 9  22.332430 15.360768 29.30409
## 10 21.290194 14.728917 27.85147
## 11 20.061562 13.573170 26.54995
## 12 13.283069  6.796484 19.76965
## 13 18.646533 12.120732 25.17233
## 14 11.122455  4.434281 17.81063
## 15 11.122455  4.434281 17.81063
## 16 13.283069  6.796484 19.76965
## 17 17.045108 10.506802 23.58341
## 18 18.646533 12.120732 25.17233
## 19  8.775444  1.423580 16.12731
## 20 18.646533 12.120732 25.17233
## 21 15.257287  8.758744 21.75583
# Adjusted R-Squared = 0.6594
#Multiple R -Squared Value = 0.6934

# Cubic model: dt on st, st^2 and st^3
poly_mod <- lm(formula = dt ~ st + I(st^2) + I(st^3), data = dt.st)
summary(object = poly_mod) # Multiple R-squared 0.7034 in the output below
## 
## Call:
## lm(formula = dt ~ st + I(st^2) + I(st^3), data = dt.st)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8972 -1.7972 -0.1601  0.8077  6.2028 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.15818   10.98653  -0.378    0.710
## st           7.50248    6.37003   1.178    0.255
## I(st^2)     -0.92525    1.10553  -0.837    0.414
## I(st^3)      0.04446    0.05874   0.757    0.460
## 
## Residual standard error: 2.998 on 17 degrees of freedom
## Multiple R-squared:  0.7034, Adjusted R-squared:  0.6511 
## F-statistic: 13.44 on 3 and 17 DF,  p-value: 9.586e-05
# 95% confidence intervals for the cubic fit; in the recorded output every
# interval includes 0
confint(poly_mod,level=0.95)
##                    2.5 %     97.5 %
## (Intercept) -27.33772564 19.0213717
## st           -5.93710786 20.9420660
## I(st^2)      -3.25771246  1.4072034
## I(st^3)      -0.07947156  0.1683837
# Predictions for the observed rows with lwr/upr interval bounds
predict(poly_mod,interval="predict")
## Warning in predict.lm(poly_mod, interval = "predict"): predictions on current data refer to _future_ responses
##          fit        lwr      upr
## 1  22.797240 15.5939839 30.00050
## 2  13.892856  7.0844910 20.70122
## 3  17.150048 10.4982995 23.80180
## 4  20.827000 14.0344853 27.61951
## 5  22.797240 15.5939839 30.00050
## 6  17.150048 10.4982995 23.80180
## 7  18.270140 11.5550572 24.98522
## 8  11.222284  4.4189573 18.02561
## 9  22.797240 15.5939839 30.00050
## 10 20.827000 14.0344853 27.61951
## 11 19.406879 12.5644227 26.24933
## 12 13.892856  7.0844910 20.70122
## 13 18.270140 11.5550572 24.98522
## 14 11.222284  4.4189573 18.02561
## 15 11.222284  4.4189573 18.02561
## 16 13.892856  7.0844910 20.70122
## 17 17.150048 10.4982995 23.80180
## 18 18.270140 11.5550572 24.98522
## 19  7.501412 -0.7718603 15.77468
## 20 18.270140 11.5550572 24.98522
## 21 15.779865  9.0162244 22.54351
# Adjusted R-Squared = 0.6511
#Multiple R -Squared Value = 0.7034

# Model comparison table.
# The figures are the ADJUSTED R-squared of each fit, read from the model
# objects (rounded to 4 decimals) rather than typed in by hand, so the table
# cannot drift from the fits above. The list key stays "R_squared" so existing
# references to it keep working.
# NOTE: reg_exp is fitted to log(dt); its value measures variance explained on
# the log scale and is not strictly comparable with the models fitted to dt.
model_R_Squared_values <- list(
  model = c("reg", "reg_log", "reg_exp", "quad_mod", "poly_mod"),
  R_squared = round(
    c(
      summary(reg)[["adj.r.squared"]],
      summary(reg_log)[["adj.r.squared"]],
      summary(reg_exp)[["adj.r.squared"]],
      summary(quad_mod)[["adj.r.squared"]],
      summary(poly_mod)[["adj.r.squared"]]
    ),
    digits = 4
  )
)
# A data frame keeps the R_squared column numeric; cbind() on a character and
# a numeric vector would coerce every value to character.
Final <- data.frame(
  model = model_R_Squared_values[["model"]],
  R_squared = model_R_Squared_values[["R_squared"]],
  stringsAsFactors = FALSE
)
View(model_R_Squared_values)
View(Final)

# The exponential model gives the best adjusted R-squared value of the fits
# above; exp() maps its predictions from log(dt) back to delivery-time units.
(predicted_Value <- exp(predict(object = reg_exp)))
##        1        2        3        4        5        6        7        8 
## 23.97203 12.72512 15.71603 21.57071 23.97203 15.71603 17.46560 11.45042 
##        9       10       11       12       13       14       15       16 
## 23.97203 21.57071 19.40993 12.72512 17.46560 11.45042 11.45042 12.72512 
##       17       18       19       20       21 
## 15.71603 17.46560 10.30341 17.46560 14.14173
# Observed values next to the predictions, one row per observation
Final <- cbind(
  Sorting_Time = dt.st$Sorting.Time,
  Delivery_Time = dt.st$Delivery.Time,
  Predicted_Delivery_time = predicted_Value
)

View(Final)

# Root-mean-squared error of the back-transformed predictions against dt
(rmse <- sqrt(mean((dt - predicted_Value)^2)))
## [1] 2.94025
# Diagnostic plots for the fitted exponential model
plot(x = reg_exp)

hist(x = residuals(reg_exp)) # close to normal distribution