# Simple Linear Regression - Q1
## Predict delivery time (Y / output) from sorting time (X / input).

# Import the data set.
# NOTE: this is an absolute, machine-specific Windows path; adjust it when
# running the script on another machine.
Q2 <- read.csv("D:\\DataScience\\Assignments\\SimpleLinearRegression\\delivery_time.csv")
# Put the columns of Q2 on the search path. Statements further down refer to
# Delivery.Time and Sorting.Time without the `Q2$` prefix and rely on this.
attach(Q2)
# View() opens a spreadsheet-style viewer; only do that when a person is
# actually driving the session, not when the script is run in batch mode.
if (interactive()) {
  View(Q2)
}
# Column names in the data set Q2
colnames(Q2)
## [1] "Delivery.Time" "Sorting.Time"
# First-moment business decisions (mean, median, range): five-number summary
# plus mean for each column.
summary(Q2$Delivery.Time)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.00 13.50 17.83 16.79 19.75 29.00
summary(Q2$Sorting.Time)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 4.00 6.00 6.19 8.00 10.00
# Delivery.Time shows some positive skew: the maximum (29.00) lies further
# from the median (17.83) than the minimum (8.00) does.

# Second-moment business decisions (variance, standard deviation).
var(Q2$Sorting.Time)
## [1] 6.461905
var(Q2$Delivery.Time)
## [1] 25.75462
sd(Q2$Sorting.Time)
## [1] 2.542028
sd(Q2$Delivery.Time)
## [1] 5.074901
# Third- and fourth-moment business decisions (skewness and kurtosis),
# followed by distribution plots for each column.
# skewness() and kurtosis() come from the e1071 package.
library(e1071)

# --- Delivery.Time ---
skewness(Q2$Delivery.Time)
## [1] 0.3036468
kurtosis(Q2$Delivery.Time)
## [1] -0.3021186
barplot(Q2$Delivery.Time)

# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
boxplot(Q2$Delivery.Time, horizontal = TRUE)

hist(Q2$Delivery.Time)

# Normal Q-Q plot with its reference line, to judge normality by eye.
qqnorm(Q2$Delivery.Time)
qqline(Q2$Delivery.Time)

# --- Sorting.Time ---
skewness(Q2$Sorting.Time)
## [1] 0.04059837
kurtosis(Q2$Sorting.Time)
## [1] -1.335955
barplot(Q2$Sorting.Time)

hist(Q2$Sorting.Time)

boxplot(Q2$Sorting.Time, horizontal = TRUE)

# Based on the boxplot, there are no outliers.
qqnorm(Q2$Sorting.Time)
qqline(Q2$Sorting.Time)

# Based on the Q-Q plot, Sorting.Time was judged to be approximately normally
# distributed (points following the reference line).
# Pearson correlation coefficient between the response (Delivery.Time) and the
# predictor (Sorting.Time). Rule of thumb used here: |r| > 0.85 is "strong".
with(Q2, cor(Delivery.Time, Sorting.Time))
## [1] 0.8259973
# r = 0.826 is a fairly high positive correlation, just under the 0.85
# threshold above. Note that r is not a model accuracy: a straight-line fit
# would explain about r^2 = 0.68 of the variance in Delivery.Time.
plot(Q2)

# A scatter plot shows the relationship between the two variables.
# From this plot we can expect a positive, roughly linear association between
# Sorting.Time and Delivery.Time.
# Baseline model: ordinary least squares of Delivery.Time on Sorting.Time.
# `data = Q2` makes the model look the columns up in the data frame itself
# instead of depending on an earlier attach(Q2) / the search path.
Model2 <- lm(Delivery.Time ~ Sorting.Time, data = Q2)
summary(Model2)
##
## Call:
## lm(formula = Delivery.Time ~ Sorting.Time, data = Q2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1729 -2.0298 -0.0298 0.8741 6.6722
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.5827 1.7217 3.823 0.00115 **
## Sorting.Time 1.6490 0.2582 6.387 3.98e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.935 on 19 degrees of freedom
## Multiple R-squared: 0.6823, Adjusted R-squared: 0.6655
## F-statistic: 40.8 on 1 and 19 DF, p-value: 3.983e-06
# R-squared is only about 0.68, so a transformation of the variables is tried
# next to look for a better fit.
# Transformed model: log-log regression of Delivery.Time on Sorting.Time.
# `data = Q2` makes the model look the columns up in the data frame itself
# instead of depending on an earlier attach(Q2) / the search path.
Model2t <- lm(log(Delivery.Time) ~ log(Sorting.Time), data = Q2)
summary(Model2t)
##
## Call:
## lm(formula = log(Delivery.Time) ~ log(Sorting.Time), data = Q2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.23303 -0.09050 -0.00825 0.08897 0.36439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.74199 0.13312 13.086 5.92e-11 ***
## log(Sorting.Time) 0.59752 0.07446 8.024 1.60e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1558 on 19 degrees of freedom
## Multiple R-squared: 0.7722, Adjusted R-squared: 0.7602
## F-statistic: 64.39 on 1 and 19 DF, p-value: 1.602e-07
# Based on the summary, both the intercept (B0) and the log(Sorting.Time)
# slope (B1) are statistically significant (p < 0.001), so both coefficients
# can be used in the fitted equation.
# R^2 is about 0.77, which is still below the 0.8 rule of thumb, so this is
# not yet considered a strong model.
# NOTE(review): this R^2 is measured on log(Delivery.Time), so it is not
# directly comparable with the R^2 of the untransformed model.