Simple Linear Regression

Delivery Time Dataset

Assignment 11

# Load the delivery-time data set into a data frame.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where this exact file location exists.
mydata <- read.csv(
  file = "C:\\Users\\RISHI RAHUL\\Desktop\\DS\\2 SLR\\Assignment\\delivery_time.csv"
)
# attach() puts the columns on the search path; later statements refer to
# Delivery.Time and Sorting.Time directly and depend on this call.
attach(mydata)
# List the column names of the loaded data.
names(mydata)
## [1] "Delivery.Time" "Sorting.Time"
# 1st, 2nd, 3rd, 4th Moment Business Decisions

# First moment: five-number summary and mean of every column.
summary(object = mydata)
##  Delivery.Time    Sorting.Time  
##  Min.   : 8.00   Min.   : 2.00  
##  1st Qu.:13.50   1st Qu.: 4.00  
##  Median :17.83   Median : 6.00  
##  Mean   :16.79   Mean   : 6.19  
##  3rd Qu.:19.75   3rd Qu.: 8.00  
##  Max.   :29.00   Max.   :10.00
# Second moment: sample variance and standard deviation of each column.
var(mydata$Delivery.Time)
## [1] 25.75462
var(mydata$Sorting.Time)
## [1] 6.461905
sd(mydata$Delivery.Time)
## [1] 5.074901
sd(mydata$Sorting.Time)
## [1] 2.542028
# Third and fourth moments via e1071: skewness of the delivery times and
# kurtosis of the sorting times.
library(e1071)
skewness(mydata$Delivery.Time)
## [1] 0.3036468
kurtosis(mydata$Sorting.Time)
## [1] -1.335955
# Plotting

# Box plot of the delivery times (default vertical orientation).
boxplot(Delivery.Time)

# Box plot of the sorting times, drawn horizontally.
boxplot(Sorting.Time, horizontal = TRUE)

# Histogram of the sorting times; the plot title is derived from the
# expression passed in.
hist(Sorting.Time)

# Normal Q-Q plot of the sorting times (sample vs. theoretical normal
# quantiles).
qqnorm(Sorting.Time)

# Plot the whole data frame; with the two columns listed above this shows
# Delivery.Time against Sorting.Time.
plot(mydata)

# Correlation
# Correlation matrix of the two columns (cor() defaults to Pearson).
cor(mydata)
##               Delivery.Time Sorting.Time
## Delivery.Time     1.0000000    0.8259973
## Sorting.Time      0.8259973    1.0000000
# Model

# Baseline simple linear regression of delivery time on sorting time, fitted
# on every row through the attached columns (no data = argument).
model <- lm(formula = Delivery.Time ~ Sorting.Time)
print(summary(model)) # R-squared value : 0.6823
## 
## Call:
## lm(formula = Delivery.Time ~ Sorting.Time)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1729 -2.0298 -0.0298  0.8741  6.6722 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.5827     1.7217   3.823  0.00115 ** 
## Sorting.Time   1.6490     0.2582   6.387 3.98e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.935 on 19 degrees of freedom
## Multiple R-squared:  0.6823, Adjusted R-squared:  0.6655 
## F-statistic:  40.8 on 1 and 19 DF,  p-value: 3.983e-06
# Normal Q-Q plot of the baseline model's residuals, with a reference line.
# A two-sample qqplot() of the response against the predictor compares the
# distributions of two different variables and checks no regression
# assumption; the normality of the residuals is what matters for the
# t-tests and intervals reported by summary(model).
qqnorm(residuals(model))
qqline(residuals(model))

library(car)
## Warning: package 'car' was built under R version 3.5.1
## Loading required package: carData
# Per-observation influence diagnostics for the baseline model: DFBETAS for
# each coefficient, DFFITS, covariance ratio, Cook's distance and hat value.
# Rows judged influential are marked with "*" in the "inf" column.
influence.measures(model)
## Influence measures of
##   lm(formula = Delivery.Time ~ Sorting.Time) :
## 
##       dfb.1_  dfb.Sr.T    dffit cov.r   cook.d    hat inf
## 1   1.91e-01 -0.278610 -0.33248 1.245 5.65e-02 0.1599    
## 2   3.03e-02 -0.022432  0.03389 1.216 6.06e-04 0.0847    
## 3   1.14e-01 -0.019795  0.25857 1.015 3.29e-02 0.0479    
## 4  -1.45e-01  0.242465  0.32346 1.139 5.27e-02 0.1087    
## 5  -6.23e-01  0.908714  1.08441 0.735 4.62e-01 0.1599   *
## 6  -3.81e-02  0.006603 -0.08626 1.151 3.89e-03 0.0479    
## 7   4.62e-03  0.021845  0.07042 1.165 2.60e-03 0.0527    
## 8  -2.67e-01  0.219435 -0.27796 1.203 3.96e-02 0.1264    
## 9   5.23e-01 -0.762526 -0.90995 0.860 3.52e-01 0.1599    
## 10  1.51e-01 -0.252137 -0.33636 1.131 5.68e-02 0.1087    
## 11 -1.31e-03  0.003138  0.00532 1.202 1.50e-05 0.0730    
## 12 -2.33e-01  0.173016 -0.26140 1.123 3.46e-02 0.0847    
## 13 -7.68e-03 -0.036290 -0.11698 1.145 7.12e-03 0.0527    
## 14 -3.86e-03  0.003174 -0.00402 1.275 8.53e-06 0.1264    
## 15  6.49e-02 -0.053338  0.06756 1.271 2.41e-03 0.1264    
## 16  1.62e-01 -0.119938  0.18121 1.171 1.70e-02 0.0847    
## 17 -9.41e-02  0.016308 -0.21302 1.061 2.28e-02 0.0479    
## 18 -8.38e-05 -0.000396 -0.00128 1.176 8.59e-07 0.0527    
## 19 -3.28e-01  0.285388 -0.33165 1.293 5.65e-02 0.1835    
## 20 -1.56e-03 -0.007378 -0.02378 1.175 2.98e-04 0.0527    
## 21  4.97e-01 -0.291894  0.67467 0.598 1.71e-01 0.0586   *
# Index plots of the influence diagnostics for the baseline model (car).
influenceIndexPlot(model)

# Influence plot (car); it also prints the table of noteworthy observations
# (StudRes, Hat, CookD) recorded below.
influencePlot(model)

##       StudRes        Hat      CookD
## 1  -0.7620498 0.15991157 0.05651746
## 5   2.4855038 0.15991157 0.46205304
## 9  -2.0856523 0.15991157 0.35195395
## 19 -0.6995967 0.18349300 0.05651387
## 21  2.7045160 0.05858511 0.17082097
# Refit the straight-line model without records 5 and 21, the two rows that
# influence.measures() flagged for the baseline model.
model2 <- lm(
  formula = Delivery.Time ~ Sorting.Time,
  data = mydata[-c(5, 21), ]
)
print(summary(model2)) # R-squared value : 0.7771
## 
## Call:
## lm(formula = Delivery.Time ~ Sorting.Time, data = mydata[-c(5, 
##     21), ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9376 -1.6943  0.1908  0.8925  3.9286 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.7970     1.2748   5.332 5.51e-05 ***
## Sorting.Time   1.5041     0.1954   7.699 6.13e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.076 on 17 degrees of freedom
## Multiple R-squared:  0.7771, Adjusted R-squared:  0.764 
## F-statistic: 59.27 on 1 and 17 DF,  p-value: 6.13e-07
# Records 5 and 21 removed, with a logarithmic transformation of the
# predictor: Delivery.Time is regressed on log(Sorting.Time).
model3 <- lm(
  formula = Delivery.Time ~ log(Sorting.Time),
  data = mydata[-c(5, 21), ]
)
print(summary(model3)) # R-squared value : 0.8107
## 
## Call:
## lm(formula = Delivery.Time ~ log(Sorting.Time), data = mydata[-c(5, 
##     21), ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8673 -1.2227  0.2327  0.9693  4.0901 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         2.0300     1.6837   1.206    0.244    
## log(Sorting.Time)   8.1375     0.9536   8.533  1.5e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.913 on 17 degrees of freedom
## Multiple R-squared:  0.8107, Adjusted R-squared:  0.7996 
## F-statistic: 72.82 on 1 and 17 DF,  p-value: 1.5e-07
# Records 5 and 21 removed, with a logarithmic transformation on both sides:
# log(Delivery.Time) is regressed on log(Sorting.Time).
model4 <- lm(
  formula = log(Delivery.Time) ~ log(Sorting.Time),
  data = mydata[-c(5, 21), ]
)
print(summary(model4)) # R-squared value : 0.8427
## 
## Call:
## lm(formula = log(Delivery.Time) ~ log(Sorting.Time), data = mydata[-c(5, 
##     21), ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.18371 -0.07527  0.01535  0.06894  0.20534 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.75818    0.10528  16.700 5.57e-12 ***
## log(Sorting.Time)  0.56907    0.05963   9.543 3.06e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1196 on 17 degrees of freedom
## Multiple R-squared:  0.8427, Adjusted R-squared:  0.8334 
## F-statistic: 91.07 on 1 and 17 DF,  p-value: 3.059e-08
# 95% confidence intervals for the model4 coefficients. Both coefficients
# live on the log-log scale of the fit.
confint(model4,level=0.95)
##                       2.5 %    97.5 %
## (Intercept)       1.5360581 1.9803092
## log(Sorting.Time) 0.4432565 0.6948752
# Fitted values with prediction intervals for the rows used in the fit.
# "predict" is matched to the "prediction" interval type. All three columns
# (fit, lwr, upr) are on the log(Delivery.Time) scale, not in original units.
predict(model4,interval="predict")
## Warning in predict.lm(model4, interval = "predict"): predictions on current data refer to _future_ responses
##         fit      lwr      upr
## 1  3.068506 2.798823 3.338189
## 2  2.547076 2.285026 2.809127
## 3  2.777813 2.518606 3.037019
## 4  3.008549 2.742260 3.274838
## 6  2.777813 2.518606 3.037019
## 7  2.865535 2.604786 3.126283
## 8  2.383366 2.113407 2.653326
## 9  3.068506 2.798823 3.338189
## 10 3.008549 2.742260 3.274838
## 11 2.941523 2.678288 3.204757
## 12 2.547076 2.285026 2.809127
## 13 2.865535 2.604786 3.126283
## 14 2.383366 2.113407 2.653326
## 15 2.383366 2.113407 2.653326
## 16 2.547076 2.285026 2.809127
## 17 2.777813 2.518606 3.037019
## 18 2.865535 2.604786 3.126283
## 19 2.152630 1.864086 2.441174
## 20 2.865535 2.604786 3.126283
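# MODEL4 gives the highest R-squared value (0.8427). Its response is log(Delivery.Time), so this R-squared is not directly comparable with the models fitted to the untransformed response.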
# Fitted values of model4. They are on the log(Delivery.Time) scale and cover
# only the rows used in the fit (records 5 and 21 were excluded), so the
# vector has 19 elements named by the original row numbers.
model_final <- predict(model4)
model_final
##        1        2        3        4        6        7        8        9 
## 3.068506 2.547076 2.777813 3.008549 2.777813 2.865535 2.383366 3.068506 
##       10       11       12       13       14       15       16       17 
## 3.008549 2.941523 2.547076 2.865535 2.383366 2.383366 2.547076 2.777813 
##       18       19       20 
## 2.865535 2.152630 2.865535
# RMSE on the original Delivery.Time scale.
# The fitted values are back-transformed with exp() and compared only with
# the observations that were actually used to fit model4. Subtracting the
# full 21-element Delivery.Time column from the 19 log-scale fits mixes two
# scales and silently recycles the shorter vector.
fit_rows <- match(names(model_final), rownames(mydata))
predicted_time <- exp(model_final)
actual_time <- mydata$Delivery.Time[fit_rows]
stopifnot(
  !anyNA(fit_rows),
  length(predicted_time) == length(actual_time)
)
rmse <- sqrt(mean((predicted_time - actual_time)^2))
rmse
# NOTE: re-run to obtain the RMSE. The previously recorded value (14.9031,
# accompanied by a "longer object length" recycling warning) was an artifact
# of the scale and length mismatch and is not a valid error measure.
# Standard lm diagnostic plots for model4.
plot(model4)

# Histogram of the model4 residuals; these are on the log(Delivery.Time)
# scale. The plot title is derived from the expression passed in.
hist(residuals(model4)) 

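### MODEL4 explains 84.27% of the variance in log(Delivery.Time) (R-squared = 0.8427); R-squared measures goodness of fit, not prediction accuracy.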