##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.3 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Superstore_data=read.csv("SampleSuperstore_final.csv")
head(Superstore_data)
## Ship.Mode Segment Country City State Postal.Code
## 1 Second Class Consumer United States Henderson Kentucky 42420
## 2 Second Class Consumer United States Henderson Kentucky 42420
## 3 Second Class Corporate United States Los Angeles California 90036
## 4 Standard Class Consumer United States Fort Lauderdale Florida 33311
## 5 Standard Class Consumer United States Fort Lauderdale Florida 33311
## 6 Standard Class Consumer United States Los Angeles California 90032
## Region Category Sub.Category Sales Quantity Discount Profit
## 1 South Furniture Bookcases 261.9600 2 0.00 41.9136
## 2 South Furniture Chairs 731.9400 3 0.00 219.5820
## 3 West Office Supplies Labels 14.6200 2 0.00 6.8714
## 4 South Furniture Tables 957.5775 5 0.45 -383.0310
## 5 South Office Supplies Storage 22.3680 2 0.20 2.5164
## 6 West Furniture Furnishings 48.8600 7 0.00 14.1694
1. Part 1 - Select an interesting binary column of data, or one which can be reasonably converted into a binary variable.
count_segment <- Superstore_data |> group_by(Segment) |>
summarise(Segment_count=n(),
.groups = 'drop') |> arrange(desc(Segment_count))
tail(count_segment, 10)
## # A tibble: 3 × 2
## Segment Segment_count
## <chr> <int>
## 1 Consumer 5191
## 2 Corporate 3020
## 3 Home Office 1783
Superstore_binary <- Superstore_data |>
mutate(in_consumer = case_when(Segment == 'Consumer' ~ 1,
Segment == 'Corporate' ~ 0,
Segment == 'Home Office' ~ 0))
Superstore_binary |> select("Segment","in_consumer","Sales","Quantity","Discount","Profit","Quantity")|> sample_n(10)
## Segment in_consumer Sales Quantity Discount Profit
## 1 Home Office 0 89.520 4 0.0 42.0744
## 2 Home Office 0 59.940 3 0.0 28.1718
## 3 Corporate 0 105.520 5 0.2 34.2940
## 4 Consumer 1 144.784 1 0.2 10.8588
## 5 Consumer 1 114.200 5 0.0 52.5320
## 6 Consumer 1 25.120 5 0.2 7.8500
## 7 Consumer 1 36.624 3 0.2 13.7340
## 8 Home Office 0 272.736 3 0.2 -64.7748
## 9 Consumer 1 53.088 7 0.8 -108.8304
## 10 Consumer 1 7.872 3 0.2 0.5904
Superstore_binary <- Superstore_binary |>
mutate(in_category = case_when(Category == 'Office Supplies' ~ 0,
Category == 'Furniture' ~ 1,
Category == 'Technology' ~ 1))|>
mutate(region_sep = case_when(Region == 'East' ~ 0,
Region == 'Central' ~ 0,
Region == 'West' ~ 1,
Region == 'South' ~ 1))
Superstore_binary |> select("Segment","in_consumer","Category","in_category","Region","region_sep","Sales","Quantity","Discount","Profit","Quantity")|> sample_n(10)
## Segment in_consumer Category in_category Region region_sep
## 1 Home Office 0 Office Supplies 0 West 1
## 2 Consumer 1 Office Supplies 0 West 1
## 3 Consumer 1 Office Supplies 0 East 0
## 4 Consumer 1 Office Supplies 0 East 0
## 5 Consumer 1 Office Supplies 0 West 1
## 6 Home Office 0 Furniture 1 East 0
## 7 Corporate 0 Furniture 1 West 1
## 8 Corporate 0 Office Supplies 0 West 1
## 9 Consumer 1 Office Supplies 0 Central 0
## 10 Corporate 0 Furniture 1 South 1
## Sales Quantity Discount Profit
## 1 12.840 3 0.0 3.4668
## 2 8.544 2 0.2 2.8836
## 3 12.960 2 0.0 6.2208
## 4 25.920 5 0.2 3.8880
## 5 1219.960 5 0.2 381.2375
## 6 352.450 5 0.5 -211.4700
## 7 63.936 3 0.2 6.3936
## 8 9.584 1 0.2 3.3544
## 9 338.040 3 0.2 -33.8040
## 10 15.992 1 0.2 0.9995
Part 2 - Build a logistic regression model for this variable, using between 1-4 explanatory variables.
Superstore_binary |>
ggplot(mapping = aes(x = Sales, y = in_consumer)) +
geom_jitter(width = 0, height = 0.1, shape = 'O', size = 3) +
geom_smooth(method = 'lm', se = FALSE) +
labs(title = "Modeling a Binary Response with OLS") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
model <- glm( in_consumer ~ Sales,data=Superstore_binary, family = binomial( link = 'logit'))
model$coefficients
## (Intercept) Sales
## 8.526779e-02 -3.299085e-05
\[ \hat{in\_consumer} = \log\left(\frac{p}{1 - p}\right) = 8.52677 \times e^-2 - 3.299085 \times e^-5 \times \texttt{sales} \]
So, for every increase in Sales, the odds that the segment is a consumer one is multiplied by by \(e^{+0.0221} = 1.022\), or for every increase in Sales, the odds of a segment being consumer goes down by about 12% (\(1 - 1.022 = -0.022\)). It suffices to say that e ^ coef is more interpretable than coef.
The (Intercept) represents the log-odds when all the feature values are equal to zero. This can be used to determine a 50%-probability “decision threshold” for any variable. The 50% probability is reached when \(\text{odds} = 1 \to \log(\text{odds}) = 0\). So, recalling the intercept and coefficient from above, we have
\[ \begin{align} 0 &= \log(\text{odds}) \newline \to \quad 0 &= \beta_0 + \beta_1 x_1 \newline &= 8.52677 \cdot e^-2 -3.299085 \cdot e^-5 \cdot \texttt{sales} \newline 1.1536 & = 0.0221 \cdot \texttt{sales} \newline \to \quad \texttt{sales} &= \frac{1.1536}{0.0221} = 52.1990 \end{align} \]
So, when the sales are roughly 52.1990 $ , there is a 50/50 odds that the Segment is consumer.
Now let’s look at how the sigmoid function can help us give insight into the likelihood that the product sold is a Consumer Segment.
# these coefficients come from the model
# e^-5 = 0.0067
# e^-2 = 0.1353
sigmoid <- \(x) 1 / (1 + exp(-(8.526*0.1353 - (3.299085*0.0067) * x)))
Superstore_binary |>
ggplot(mapping = aes(x = Sales, y = in_consumer)) +
geom_jitter(width = 0, height = 0.1, shape = 'O', size = 3) +
geom_function(fun = sigmoid, color = 'blue', linewidth = 1) +
labs(title = "Modeling a Binary Response with Sigmoid") +
scale_y_continuous(breaks = c(0, 0.5, 1)) +
theme_minimal()
Superstore_binary |>
ggplot(mapping = aes(x = Sales, y = in_category)) +
geom_jitter(width = 0, height = 0.1, shape = 'O', size = 3) +
geom_smooth(method = 'lm', se = FALSE) +
labs(title = "Modeling a Binary Response with OLS") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
model <- glm( in_category ~ Sales,data=Superstore_binary, family = binomial( link = 'logit'))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model$coefficients
## (Intercept) Sales
## -0.792845491 0.001918816
# these coefficients come from the model
sigmoid <- \(x) 1 / (1 + exp(-(-0.792 + 0.00191 * x)))
Superstore_binary |>
ggplot(mapping = aes(x = Sales, y = in_category)) +
geom_jitter(width = 0, height = 0.1, shape = 'O', size = 3) +
geom_function(fun = sigmoid, color = 'blue', linewidth = 1) +
labs(title = "Modeling a Binary Response with Sigmoid") +
scale_y_continuous(breaks = c(0, 0.5, 1)) +
theme_minimal()
3. Part 3 - Consider a transformation for any explanatory variable, and illustrate why you need the transformation (or why you do not)
Lets consider the - - Response variable = Profit - Explanatory Variable = Sales
Superstore_binary |>
filter(Profit >=0)|>
ggplot(mapping = aes(x = Profit)) +
geom_histogram(color = 'white') +
labs("Histogram of Profit ") +
theme_hc()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
For example, Profit (or, positive dollar amounts in general) typically
resembles slightly a Poisson distribution. Therefore we further try to
do a log over Profit to view the data better. Also, trying to check only
on profits and not losses hence filter condition of Profit >= 0.
Superstore_binary |>
filter(Profit >=0) |>
ggplot(mapping = aes(x = Sales, y = log(Profit))) +
geom_point(shape = 'O', size = 3) +
geom_smooth(se = FALSE) +
labs(title = "Log(Price) vs. Sales") +
theme_minimal()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 65 rows containing non-finite values (`stat_smooth()`).
model <- lm(Profit ~ Sales,
filter(Superstore_binary, Profit >=0))
rsquared <- summary(model)$r.squared
Superstore_binary |>
filter(Profit >=0) |>
ggplot(mapping = aes(x = Sales,
y = Profit)) +
geom_point() +
geom_smooth(method = 'lm', color = 'gray', linetype = 'dashed',
se = FALSE) +
geom_smooth(se = FALSE) +
labs(title = "Profit vs. Sales",
subtitle = paste("Linear Fit R-Squared =", round(rsquared, 3))) +
theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
There is a clear monotonic relationship between \(x\) (sales) and \(y\) (profit) here, “as \(x\) increases, \(y\) increases.” In this case the relationship is quadratic, so \(y\) could be modeled non-linearly as= \[ y = ax^2 + bx + c + \varepsilon \]
From the plot, we could also determine that the there is a quadratic relation between (x)Sales and (y)Profit. Further transforming the (sales)explanatory variable to create an effective regressive model and plotting the same.
Superstore_binary_filt <- Superstore_binary |>
mutate(sales_sqft_2 = Sales ^ 2) # add new variable
model <- lm(Profit ~ sales_sqft_2 + Sales,
filter(Superstore_binary_filt))
rsquared <- summary(model)$r.squared
Superstore_binary_filt |>
filter(Profit >=1) |>
ggplot(mapping = aes(x = sales_sqft_2,
y = Profit)) +
geom_point() +
geom_smooth(method = 'lm', color = 'gray', linetype = 'dashed',
se = FALSE) +
geom_smooth(se = FALSE) +
labs(title = " Profit vs. (Sales) ^ 2",
subtitle = paste("Linear Fit R-Squared =", round(rsquared, 3))) +
theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
The above plot of \(y\) vs. \(x\) determines the transformation on \(x\).i.e. it looks like \(y \approx \sqrt{x}\), hence we created a new variable in our data frame like sales_sqft_2 = sqrt(sales). Not much with respect to data has changed but the line is trying to be a linear one.