library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.3 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Superstore_data <- read.csv("SampleSuperstore_final.csv")
head(Superstore_data)
## Ship.Mode Segment Country City State Postal.Code
## 1 Second Class Consumer United States Henderson Kentucky 42420
## 2 Second Class Consumer United States Henderson Kentucky 42420
## 3 Second Class Corporate United States Los Angeles California 90036
## 4 Standard Class Consumer United States Fort Lauderdale Florida 33311
## 5 Standard Class Consumer United States Fort Lauderdale Florida 33311
## 6 Standard Class Consumer United States Los Angeles California 90032
## Region Category Sub.Category Sales Quantity Discount Profit
## 1 South Furniture Bookcases 261.9600 2 0.00 41.9136
## 2 South Furniture Chairs 731.9400 3 0.00 219.5820
## 3 West Office Supplies Labels 14.6200 2 0.00 6.8714
## 4 South Furniture Tables 957.5775 5 0.45 -383.0310
## 5 South Office Supplies Storage 22.3680 2 0.20 2.5164
## 6 West Furniture Furnishings 48.8600 7 0.00 14.1694
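Before modeling, it is worth confirming the column types and checking for missing values. A minimal sketch using base R (output omitted; the column names follow the head() output above):
str(Superstore_data)             # column types and dimensions
colSums(is.na(Superstore_data))  # count of missing values per column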
Part 1: Build a linear (or generalized linear) model as you like
Sup_data_filtered <- Superstore_data |>
  filter(Profit > 0, Region == 'West', Segment == 'Home Office')
sup_grouped <- Sup_data_filtered |>
  group_by(Region) |>
  summarise(mean_profit = mean(Profit))
sup_grouped
## # A tibble: 1 × 2
## Region mean_profit
## <chr> <dbl>
## 1 West 47.1
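As a quick sanity check that the filter behaved as intended (a sketch; the exact count depends on the CSV):
nrow(Sup_data_filtered)          # number of rows kept by the filter
range(Sup_data_filtered$Profit)  # should be strictly positive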
Sup_data_filtered |>
  ggplot() +
  geom_point(mapping = aes(x = Sales, y = Profit)) +
  geom_hline(data = sup_grouped,
             mapping = aes(yintercept = mean_profit),
             color = 'darkorange', linetype = 'dashed') +
  labs(title = "Profit by Sales Price for Western Region - Segment: Home Office",
       x = "Sales", y = "Profit") +
  theme_minimal()
model <- lm(Profit ~ Sales, data = Sup_data_filtered)
model$coefficients
## (Intercept) Sales
## 0.1877567 0.1970041
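For reference, interval estimates for these coefficients are available via confint(); a minimal sketch (output omitted):
confint(model)  # 95% confidence intervals for the intercept and slope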
Sup_data_filtered |>
  ggplot(mapping = aes(x = Sales, y = Profit)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE, linewidth = 0.5) +
  labs(title = "Profit by Sales Price for Western Region - Segment: Home Office",
       x = "Sales", y = "Profit") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Since the scatter of points fans out in a cone shape (the variance of Profit grows with Sales), we will try a transformation of the response variable, Profit.
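Before transforming, a residuals-versus-fitted plot makes the non-constant variance explicit; a minimal sketch using the linear fit above (a fan shape here mirrors the cone in the scatterplot):
tibble(fit = fitted(model), res = resid(model)) |>
  ggplot(mapping = aes(x = fit, y = res)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = 'dashed', color = 'darkorange') +
  labs(title = "Residuals vs. Fitted for the Untransformed Model",
       x = "Fitted values", y = "Residuals") +
  theme_minimal()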
Sup_data_filtered <- Sup_data_filtered |>
  mutate(log_profit_square = log(Profit)^2)
Sup_data_filtered |>
  ggplot(mapping = aes(x = Sales, y = log_profit_square)) +
  geom_point(shape = 'O', size = 3) +
  labs(title = "Log(Profit)^2 vs. Sales Price for Western Region") +
  theme_minimal()
# gaussian family with identity link by default, so equivalent to lm()
model <- glm(log_profit_square ~ Sales, data = Sup_data_filtered)
model$coefficients
## (Intercept) Sales
## 6.2522707 0.0127788
Sup_data_filtered |>
  ggplot(mapping = aes(x = Sales, y = log_profit_square)) +
  geom_smooth(method = 'glm', se = FALSE, linewidth = 0.5) +
  geom_point(shape = 'O', size = 3) +
  labs(title = "Log(Profit)^2 vs. Sales Price for Western Region") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
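Predictions from this model live on the (log(Profit))^2 scale. A sketch of back-transforming to dollars for a few hypothetical Sales values (note that squaring discards the sign of log(Profit), so exp(sqrt(.)) is only a valid inverse when Profit >= 1):
new_sales <- data.frame(Sales = c(100, 500, 1000))  # hypothetical values
pred_lp2 <- predict(model, newdata = new_sales)     # predicted (log Profit)^2
exp(sqrt(pred_lp2))                                 # implied Profit, assuming log(Profit) >= 0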
Part 2: Use the tools from previous weeks to diagnose the model. Highlight any issues with the model.
# Assess model fit
summary(model)
##
## Call:
## glm(formula = log_profit_square ~ Sales, data = Sup_data_filtered)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.2522707 0.2659445 23.51 <2e-16 ***
## Sales 0.0127788 0.0004497 28.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 30.34772)
##
## Null deviance: 39984 on 511 degrees of freedom
## Residual deviance: 15477 on 510 degrees of freedom
## AIC: 3204.3
##
## Number of Fisher Scoring iterations: 2
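The summary alone does not show residual patterns. The standard four diagnostic panels (residuals vs. fitted, normal Q-Q, scale-location, residuals vs. leverage) are one way to look for remaining issues; a minimal sketch using base R's plot method:
par(mfrow = c(2, 2))  # 2x2 grid of diagnostic panels
plot(model)
par(mfrow = c(1, 1))  # restore the default layout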
Part 3: Interpret at least one of the coefficients
coef(model)
## (Intercept) Sales
## 6.2522707 0.0127788
We can say that: the intercept (6.2523) is the expected value of the transformed response, (log(Profit))^2, when Sales = 0 for the Home Office segment within the Western region. Because the response is on a transformed scale, this baseline is not a Profit in dollars.
For each one-unit increase in Sales, the expected value of (log(Profit))^2 increases by approximately 0.0128. Note that this is not a log-odds: the model is a Gaussian GLM with an identity link (equivalent to ordinary least squares on the transformed response), not a logistic regression.
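To make the slope concrete, a hypothetical calculation of the expected transformed response at Sales = 100 versus Sales = 101 (the values are illustrative):
# expected (log Profit)^2 at two adjacent Sales values
preds <- predict(model, newdata = data.frame(Sales = c(100, 101)))
diff(preds)  # ~0.0128, matching the Sales coefficient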