# Import sm.csv with readr; the column specification it inferred is echoed
# in the output recorded below.
data <- read_csv(file = "sm.csv")
## Parsed with column specification:
## cols(
## `Invoice ID` = col_character(),
## Branch = col_character(),
## City = col_character(),
## `Customer type` = col_character(),
## Gender = col_character(),
## `Product line` = col_character(),
## `Unit price` = col_double(),
## Quantity = col_double(),
## `Tax 5%` = col_double(),
## Total = col_double(),
## Date = col_character(),
## Time = col_time(format = ""),
## Payment = col_character(),
## cogs = col_double(),
## `gross margin percentage` = col_double(),
## `gross income` = col_double(),
## Rating = col_double()
## )
# Show the imported table.
print(data)
Formula : “Total Bill” (COGS) can be used to predict the “Unit Price”
# Keep only the two columns used by this model.
data_tq <- data %>%
  select(cogs, `Unit price`)

# Scatter plot of COGS against unit price.
data_tq %>%
  ggplot() +
  geom_point(aes(y = cogs, x = `Unit price`)) +
  ggtitle("Scatter Plot by Cost of Good and UNIT Price")

# Regress unit price on COGS. Columns are referenced by bare name so that lm()
# resolves them through `data`; writing data_tq$col inside the formula bypasses
# `data`, bakes the data frame name into the coefficient labels and makes
# predict(lm_tq, newdata = ...) ignore the new data.
lm_tq <- lm(`Unit price` ~ cogs, data = data_tq)
summary(lm_tq)
##
## Call:
## lm(formula = `Unit price` ~ cogs, data = data_tq)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.511 -16.204 -4.338 12.760 58.930
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.61006 1.07053 31.4 <2e-16 ***
## cogs 0.07173 0.00277 25.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.5 on 998 degrees of freedom
## Multiple R-squared: 0.4019, Adjusted R-squared: 0.4013
## F-statistic: 670.6 on 1 and 998 DF, p-value: < 2.2e-16
# Base-R diagnostic plots for the fitted model.
plot(lm_tq)
Checking if “Total Bill” (COGS) can be used to predict the Rating.
# Keep only the two columns used by this model. The previous
# group_by(cogs, Rating) only attached a grouping (one group per distinct
# cogs/Rating pair) that nothing below uses.
data_tq <- data %>%
  select(cogs, Rating)

# Scatter plot of COGS against rating.
data_tq %>%
  ggplot() +
  geom_point(aes(x = Rating, y = cogs)) +
  ggtitle("Scatter Plot by 'Total Bill' and Rating")

# NOTE(review): the heading talks about predicting Rating from COGS, but this
# model has cogs as the response and Rating as the predictor. In a simple
# linear regression the slope t-test, p-value and R-squared are the same in
# either direction, so the significance conclusion is unaffected -- confirm
# which direction is intended before interpreting the coefficients.
# Columns are referenced by bare name so lm() resolves them through `data`.
lm_tq <- lm(cogs ~ Rating, data = data_tq)
plot(lm_tq)
summary(lm_tq)
##
## Call:
## lm(formula = cogs ~ Rating, data = data_tq)
##
## Residuals:
## Min 1Q Median 3Q Max
## -302.74 -189.18 -64.67 142.67 690.79
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 342.211 30.954 11.056 <2e-16 ***
## Rating -4.966 4.310 -1.152 0.25
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 234.1 on 998 degrees of freedom
## Multiple R-squared: 0.001328, Adjusted R-squared: 0.0003273
## F-statistic: 1.327 on 1 and 998 DF, p-value: 0.2496
Formula: Predict Quantity by Hour of the day.
# Total quantity sold per product line and hour of day. The hour is given an
# explicit column name instead of the auto-generated `hour(Time)`, and the
# result is ungrouped because summarise() only drops the last grouping level
# and would otherwise leave the tibble grouped by `Product line`.
data_tq <- data %>%
  group_by(`Product line`, hour = hour(Time)) %>%
  summarise(Quantity = sum(Quantity)) %>%
  ungroup()

# Scatter plot of the hourly totals.
data_tq %>%
  ggplot() +
  geom_point(aes(x = hour, y = Quantity)) +
  ggtitle("Scatter Plot by Hour and Quantity")

# Regress quantity on hour. Columns are referenced by bare name so lm()
# resolves them through `data` rather than through data_tq$... lookups.
lm_tq <- lm(Quantity ~ hour, data = data_tq)
plot(lm_tq)
summary(lm_tq)
##
## Call:
## lm(formula = Quantity ~ hour, data = data_tq)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.212 -19.798 1.879 14.979 57.933
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 96.3030 14.1318 6.815 3.93e-09 ***
## hour -0.8545 0.9219 -0.927 0.357
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.68 on 64 degrees of freedom
## Multiple R-squared: 0.01325, Adjusted R-squared: -0.002169
## F-statistic: 0.8593 on 1 and 64 DF, p-value: 0.3574
Using Minutes of time and trying to predict Average Rating
# Mean rating for each minute value extracted from Time. The grouping column
# keeps its auto-generated name `minute(Time)`.
data_tq <- summarise(group_by(data, minute(Time)), Rating = mean(Rating))

# Scatter plot of mean rating against minute, stored and then displayed.
m4 <- ggplot(data_tq) +
  geom_point(aes(x = `minute(Time)`, y = Rating)) +
  ggtitle("Scatter Plot by Minutes and Rating")
print(m4)

# Simple linear regression of mean rating on minute.
lm1 <- lm(formula = Rating ~ `minute(Time)`, data = data_tq)
summary(lm1)
##
## Call:
## lm(formula = Rating ~ `minute(Time)`, data = data_tq)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.17606 -0.15542 -0.01735 0.17895 1.26451
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.820002 0.108800 62.684 <2e-16 ***
## `minute(Time)` 0.004564 0.003181 1.435 0.157
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4267 on 58 degrees of freedom
## Multiple R-squared: 0.03429, Adjusted R-squared: 0.01764
## F-statistic: 2.059 on 1 and 58 DF, p-value: 0.1566
# Base-R diagnostic plots for the fitted model.
plot(lm1)
# Residuals against fitted values. The residuals were previously plotted
# against the observed Rating; residuals are correlated with the response by
# construction (regressing them on the response gives a slope of 1 - R^2), so
# that plot shows a strong trend regardless of how well the model fits and
# cannot be used to judge linearity or constant variance.
# NOTE(review): the printed fit recorded after this call was produced with
# x = data_tq$Rating and needs to be regenerated.
r4 <- plot_ss(x = lm1$fitted.values, y = lm1$residuals, showSquares = TRUE)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## -6.7162 0.9657
##
## Sum of Squares: 0.362
To assess whether the linear model is reliable, we need to check for
1. linearity,
2. nearly normal residuals, and
3. constant variability.
4. Residuals are independent
Residual = Observed value - Predicted value
# Residuals against fitted values. Plotting residuals against the observed
# Rating is misleading: residuals are correlated with the response by
# construction (slope 1 - R^2), so a trend appears even for a sound model.
# NOTE(review): the printed fit recorded after this call was produced with
# x = data_tq$Rating and needs to be regenerated.
plot_ss(x = lm1$fitted.values, y = lm1$residuals, showSquares = TRUE)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## -6.7162 0.9657
##
## Sum of Squares: 0.362
- Linear association: The residuals plot shows a random scatter. Based on the plot, there is no apparent pattern in the distribution: the points are grouped close to the regression line, so the relationship can be considered linear.
- Nearly normal residuals: To check this condition, we can look at a histogram
# Histogram of the residuals of lm1, used to judge whether they are roughly
# normally distributed.
hist(lm1$residuals)
- From the Q-Q plot for Model 4, we can say that the residuals are also nearly normal, even though the distribution is right-skewed with a few outliers.
- The residuals can be treated as independent, as the data was collected at random.
Model with Regression line.
# Scatter plot of mean rating per minute with the least-squares line overlaid.
# Columns are mapped by bare name so ggplot2 looks them up in `data`; using
# data_tq$... inside aes() bypasses `data`, triggers a ggplot2 warning and
# leaks the data frame name into the axis labels.
ggplot(data = data_tq, mapping = aes(x = `minute(Time)`, y = Rating)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "green")