library(tidyverse)
## Warning: package 'lubridate' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tree)
## Warning: package 'tree' was built under R version 4.4.3
STHQ = read.csv("STHQ_cleaned_4.25.25.csv")
names(STHQ)
## [1] "Company" "Location" "Status" "Tangible"
## [5] "Sport" "International" "BTB"
Column Meanings / Documentation
Company
Location
Status
0: denied MOU
1: accepted MOU
2: in progress
Tangible
0: primarly sells intangible goods
1: primarily sells tangible goods
Sport
0: business does not exclusively rely on existence of sporting teams or athletes
1: business exclusively relies on existence of sporting teams or athletes
BTB
0: primarily B2C
1: primarily B2B
# create data frame for all companies who have signed or rejected MOU
df = STHQ |>
filter(Status != 2) |>
droplevels() |>
as.data.frame()
#df = as.data.frame(df)
declined = STHQ |>
filter(Status == 0)
accepted = STHQ |>
filter(Status == 1)
nrow(declined)
## [1] 13
nrow(accepted)
## [1] 33
For purposes of making my life easy, I will make all odds ratios >1. All this would really do is flip flop the base. Since there isn’t any meaningful reason to chose one variable over another, but explanation makes it easier when odds > 1, I’m going to do that.
tangible.table = table(Status = df$Status, Tangible = df$Tangible)
tangible.1.odds = tangible.table[2,2] / tangible.table[1,2]
tangible.0.odds = tangible.table[2,1] / tangible.table[1,1]
tangible.odds = tangible.0.odds / tangible.1.odds
print(tangible.table)
## Tangible
## Status 0 1
## 0 8 5
## 1 24 9
cat("Odds intangible v tangible:", tangible.odds, '\n')
## Odds intangible v tangible: 1.666667
cat("Odds intangible:", tangible.0.odds, '\n')
## Odds intangible: 3
cat("Odds tangible:", tangible.1.odds, '\n')
## Odds tangible: 1.8
sport.table = table(Status = df$Status, Sport = df$Sport)
sport.1.odds = sport.table[2,2] / sport.table[1,2]
sport.0.odds = sport.table[2,1] / sport.table[1,1]
sport.odds = sport.0.odds / sport.1.odds
print(sport.table)
## Sport
## Status 0 1
## 0 2 11
## 1 14 19
cat("Odds non-sport v sport:", sport.odds, '\n')
## Odds non-sport v sport: 4.052632
cat("Odds non-sport:", sport.0.odds, '\n')
## Odds non-sport: 7
cat("Odds sport:", sport.1.odds, '\n')
## Odds sport: 1.727273
int.table = table(Status = df$Status, International = df$International)
int.1.odds = int.table[2,2] / int.table[1,2]
int.0.odds = int.table[2,1] / int.table[1,1]
int.odds = int.1.odds / int.0.odds
print(int.table)
## International
## Status 0 1
## 0 10 3
## 1 24 9
cat("Odds international v domestic:", int.odds, '\n')
## Odds international v domestic: 1.25
cat("Odds international:", int.1.odds, '\n')
## Odds international: 3
cat("Odds domestic:", int.0.odds, '\n')
## Odds domestic: 2.4
btb.table = table(Status = df$Status, B2B = df$BTB)
btb.1.odds = btb.table[2,2] / btb.table[1,2]
btc.0.odds = btb.table[2,1] / btb.table[1,1]
btc.odds = btc.0.odds / btb.1.odds
print(btb.table)
## B2B
## Status 0 1
## 0 6 7
## 1 22 11
cat("Odds B2C v B2B:", btc.odds, '\n')
## Odds B2C v B2B: 2.333333
cat("Odds B2B:", btb.1.odds, '\n')
## Odds B2B: 1.571429
cat("Odds B2C:", btc.0.odds, '\n')
## Odds B2C: 3.666667
This is so we can gain evidence that the data we have would be significant.
How this will work
Perform bootstrapping on the existing data in df
Perform 1-tail test to see how many bootstrapping re-samples result in >0 or <0
Use the proportion gathered as CI interval which would be acceptable
Since we are using small data in a business setting, Wyatt may not care about reaching a 95%+ CI
Instead, we’ll give the level of confidence that would garner rejecting the null hypothesis, and we’ll let Wyatt determine what confidence level he is okay with. He can accept all those > his level, and reject the rest.
# boot function
boot.sample = function(data, column.name) {
samp.data = data[sample(nrow(data), replace=TRUE), ]
table(Status = samp.data$Status, GoodType = samp.data[[column.name]])
}
n = nrow(df)
set.seed(1)
# get bootstrap results
boot.results = replicate(1000, boot.sample(df, "Tangible"), simplify='array')
tang = boot.results[2,1,] / (boot.results[1,1,] + boot.results[2,1,])
intang = boot.results[2,2,] / (boot.results[1,2,] + boot.results[2,2,])
tangible.difference = intang - tang
hist(tangible.difference)
tangible.CI = sum(tangible.difference < 0) / 1000
cat("Intangible v Tangible CI:", tangible.CI)
## Intangible v Tangible CI: 0.768
set.seed(1)
# get bootstrap results
boot.results = replicate(1000, boot.sample(df, "Sport"), simplify='array')
sport = boot.results[2,1,] / (boot.results[1,1,] + boot.results[2,1,])
non.sport = boot.results[2,2,] / (boot.results[1,2,] + boot.results[2,2,])
sport.difference = non.sport - sport
hist(sport.difference)
sport.CI = sum(sport.difference < 0) / 1000
cat("Nonsport v Sport CI:", sport.CI)
## Nonsport v Sport CI: 0.97
set.seed(1)
# get bootstrap results
boot.results = replicate(1000, boot.sample(df, "International"), simplify='array')
int = boot.results[2,1,] / (boot.results[1,1,] + boot.results[2,1,])
domestic = boot.results[2,2,] / (boot.results[1,2,] + boot.results[2,2,])
international.difference = int - domestic
hist(international.difference)
international.CI = sum(international.difference < 0) / 1000
cat("International v Domestic CI:", international.CI)
## International v Domestic CI: 0.613
set.seed(1)
# get bootstrap results
btb.results = replicate(1000, boot.sample(df, "BTB"), simplify='array')
btc = boot.results[2,1,] / (boot.results[1,1,] + boot.results[2,1,])
btb = boot.results[2,2,] / (boot.results[1,2,] + boot.results[2,2,])
btb.difference = btb - btc
hist(btb.difference)
btb.CI = sum(btb.difference < 0) / 1000
cat("B2C v B2B CI:", btb.CI)
## B2C v B2B CI: 0.387
# tidyverse breaks down for tree.cv
# don't ask me why, but it is what it is
# and this below works
df2 = STHQ[STHQ$Status != 2, ]
I tried doing logistic regression but didn’t get much out of it. I’m going to stick with a basic regression tree and use that as an example for Wyatt to understand. I think it should be solid.
tree.pred = tree(Status ~ Sport + Tangible + BTB + International, data = df2)
summary(tree.pred)
##
## Regression tree:
## tree(formula = Status ~ Sport + Tangible + BTB + International,
## data = df2)
## Variables actually used in tree construction:
## [1] "Sport" "BTB"
## Number of terminal nodes: 3
## Residual mean deviance: 0.1898 = 8.161 / 43
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.8750 -0.4545 0.1250 0.0000 0.2632 0.5455
Interestingly International and BTB wasn’t even used, affirming based on bootstrapped evidence that they aren’t very important in prediction.
plot(tree.pred)
text(tree.pred, pretty = 0)
Pretty simple tree… but that’s good for this situation!
cv.STHQ = cv.tree(tree.pred)
plot(cv.STHQ$size, cv.STHQ$dev, type = "b")
The best is a size of 3, which is what we had before. This is pretty simple. Interestingly, it also suggests that whether the product was tangible or not isn’t relevant.
prune.STHQ <- prune.tree(tree.pred, best = 3)
plot(prune.STHQ)
text(tree.pred, pretty = 0)
pred = predict(tree.pred, newdata = df2)
tree.rmse = sqrt(mean((df2$Status - pred)^2))
cat("RMSE of Tree:", tree.rmse)
## RMSE of Tree: 0.4212167
Only a bit better than random guessing… that’s not great.
RMSE might not be the best to use here. Maybe we’ll look at classification?
df2$Status.f = factor(df2$Status)
tree.pred.f = tree(Status.f ~ Sport + Tangible + BTB + International, data = df2)
summary(tree.pred.f)
##
## Classification tree:
## tree(formula = Status.f ~ Sport + Tangible + BTB + International,
## data = df2)
## Variables actually used in tree construction:
## [1] "Sport" "BTB"
## Number of terminal nodes: 3
## Residual mean deviance: 1.142 = 49.12 / 43
## Misclassification error rate: 0.2609 = 12 / 46
A miss-classification error of 0.26 isn’t too bad, although the overfitting can be an issue. We went into this expecting overfitting to be risk though.
plot(tree.pred.f)
text(tree.pred.f, pretty = 0)
Exact same as before.
all.yes = nrow(declined) / nrow(df2)
cat("Classification error if only guessing yes:", all.yes)
## Classification error if only guessing yes: 0.2826087