# Simple Linear Regression - Q1
## Predict Delivery Time (Y / output) from Sorting Time (X / input)


# Packages ----
library(e1071)  # provides skewness() and kurtosis()

# Data import ----
Q2 <- read.csv("D:\\DataScience\\Assignments\\SimpleLinearRegression\\delivery_time.csv")

# Fail early if the file does not contain the two columns used below.
stopifnot(all(c("Delivery.Time", "Sorting.Time") %in% colnames(Q2)))

# Columns are referenced explicitly as Q2$<column> (and via `data = Q2` in
# lm()) rather than through attach(Q2): attach() copies the columns onto the
# search path, where they can be masked by, or silently diverge from, other
# objects with the same name.

# The spreadsheet viewer is only useful in an interactive session.
if (interactive()) {
  View(Q2)
}

# Column names in the dataset Q2
colnames(Q2)
## [1] "Delivery.Time" "Sorting.Time"

# First-moment business decision (mean, median, range) ----
summary(Q2$Delivery.Time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.00   13.50   17.83   16.79   19.75   29.00
summary(Q2$Sorting.Time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    4.00    6.00    6.19    8.00   10.00
# Delivery.Time shows slight positive skewness (see skewness() below).

# Second-moment business decision (variance, standard deviation) ----
var(Q2$Sorting.Time)
## [1] 6.461905
var(Q2$Delivery.Time)
## [1] 25.75462
sd(Q2$Sorting.Time)
## [1] 2.542028
sd(Q2$Delivery.Time)
## [1] 5.074901

# Third- and fourth-moment business decision (skewness, kurtosis) ----
skewness(Q2$Delivery.Time)
## [1] 0.3036468
kurtosis(Q2$Delivery.Time)
## [1] -0.3021186
barplot(Q2$Delivery.Time)

boxplot(Q2$Delivery.Time, horizontal = TRUE)

hist(Q2$Delivery.Time)

qqnorm(Q2$Delivery.Time)
qqline(Q2$Delivery.Time)

skewness(Q2$Sorting.Time)
## [1] 0.04059837
kurtosis(Q2$Sorting.Time)
## [1] -1.335955
barplot(Q2$Sorting.Time)

hist(Q2$Sorting.Time)

boxplot(Q2$Sorting.Time, horizontal = TRUE)

# Based on the boxplots, there are no outliers.

qqnorm(Q2$Sorting.Time)
qqline(Q2$Sorting.Time)

# The Q-Q plot is used to judge whether Sorting.Time is approximately
# normally distributed (points lying close to the reference line).

# Correlation ----
# Correlation coefficient between X and Y; |r| > 0.85 is treated here as a
# strong correlation.
cor(Q2$Delivery.Time, Q2$Sorting.Time)
## [1] 0.8259973
# r is about 0.83: a fairly strong positive linear relationship, slightly
# below the 0.85 threshold. r is not a model accuracy; the share of variance
# explained by a simple linear fit is r^2, about 0.68.

# Scatter plot of the response against the predictor. The formula interface
# puts Sorting.Time (X / input) on the horizontal axis, matching the model.
plot(Delivery.Time ~ Sorting.Time, data = Q2)

# The scatter plot shows the relationship between the two variables; a
# positive, fairly strong correlation is expected from it.

# Model 1: linear fit on the original scale ----
Model2 <- lm(Delivery.Time ~ Sorting.Time, data = Q2)

summary(Model2)
## 
## Call:
## lm(formula = Delivery.Time ~ Sorting.Time, data = Q2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1729 -2.0298 -0.0298  0.8741  6.6722 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.5827     1.7217   3.823  0.00115 ** 
## Sorting.Time   1.6490     0.2582   6.387 3.98e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.935 on 19 degrees of freedom
## Multiple R-squared:  0.6823, Adjusted R-squared:  0.6655 
## F-statistic:  40.8 on 1 and 19 DF,  p-value: 3.983e-06
# R^2 is 0.68, below the 0.8 target, so a transformation is tried next.

# Model 2: log-log transformation ----
Model2t <- lm(log(Delivery.Time) ~ log(Sorting.Time), data = Q2)

summary(Model2t)
## 
## Call:
## lm(formula = log(Delivery.Time) ~ log(Sorting.Time), data = Q2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.23303 -0.09050 -0.00825  0.08897  0.36439 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.74199    0.13312  13.086 5.92e-11 ***
## log(Sorting.Time)  0.59752    0.07446   8.024 1.60e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1558 on 19 degrees of freedom
## Multiple R-squared:  0.7722, Adjusted R-squared:  0.7602 
## F-statistic: 64.39 on 1 and 19 DF,  p-value: 1.602e-07
# Based on the summary, both the intercept (B0) and the log(Sorting.Time)
# slope (B1) are statistically significant (p < 0.001), so both coefficients
# can be used in the fitted equation.

# R^2 is 0.77, still below the 0.8 target, so this is not yet a strong model.
# NOTE: this R^2 is measured on the log scale of Delivery.Time, so it is not
# directly comparable with the R^2 of Model2.