Load in Libraries

library(tidyverse)
## Warning: package 'lubridate' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tree)
## Warning: package 'tree' was built under R version 4.4.3

Load Data

STHQ = read.csv("STHQ_cleaned_4.25.25.csv")
names(STHQ)
## [1] "Company"       "Location"      "Status"        "Tangible"     
## [5] "Sport"         "International" "BTB"

Column Meanings / Documentation

Company and Location identify each company. Status records where the company stands on the MOU: 0 = declined, 1 = signed, and 2 = neither yet (these are excluded below). Tangible, Sport, International, and BTB are 0/1 indicators for whether the product is tangible, whether the company is sports-related, whether it operates internationally, and whether it sells B2B (1) rather than B2C (0).

# create data frame for all companies who have signed or rejected MOU
df = STHQ |>
  filter(Status != 2) |>
  droplevels() |>
  as.data.frame()
declined = STHQ |>
  filter(Status == 0)
accepted = STHQ |>
  filter(Status == 1)
nrow(declined)
## [1] 13
nrow(accepted)
## [1] 33

Odds Ratios

To make my life easy, I will orient all odds ratios so they are greater than 1. All this does is flip which group serves as the base: an odds ratio in one direction is just the reciprocal of the other. Since there isn't any meaningful reason to choose one base over the other, and explanation is easier when the odds ratio is above 1, that's the convention I'll use.
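
As a quick sketch of that convention (my own illustration, not part of the original analysis), a hypothetical helper like odds.ratio.oriented() below computes the odds ratio from a 2x2 Status-by-indicator table and inverts it whenever it comes out below 1:

# hypothetical helper: odds ratio from a table(Status, indicator) 2x2 table,
# oriented so the result is always > 1
odds.ratio.oriented = function(tab) {
  odds.0 = tab[2, 1] / tab[1, 1]  # odds of signing when the indicator is 0
  odds.1 = tab[2, 2] / tab[1, 2]  # odds of signing when the indicator is 1
  or = odds.0 / odds.1
  if (or < 1) or = 1 / or         # flip the base if needed
  or
}
# e.g. odds.ratio.oriented(table(df$Status, df$Tangible)) should match the
# 1.666667 computed step by step below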

Tangible

tangible.table = table(Status = df$Status, Tangible = df$Tangible)
# row 1 = declined (Status 0), row 2 = signed (Status 1)
tangible.1.odds = tangible.table[2,2] / tangible.table[1,2]  # odds of signing when tangible
tangible.0.odds = tangible.table[2,1] / tangible.table[1,1]  # odds of signing when intangible
tangible.odds = tangible.0.odds / tangible.1.odds            # oriented so it lands above 1

print(tangible.table)
##       Tangible
## Status  0  1
##      0  8  5
##      1 24  9
cat("Odds intangible v tangible:", tangible.odds, '\n')
## Odds intangible v tangible: 1.666667
cat("Odds intangible:", tangible.0.odds, '\n')
## Odds intangible: 3
cat("Odds tangible:", tangible.1.odds, '\n')
## Odds tangible: 1.8

Sport

sport.table = table(Status = df$Status, Sport = df$Sport)
sport.1.odds = sport.table[2,2] / sport.table[1,2]
sport.0.odds = sport.table[2,1] / sport.table[1,1]
sport.odds = sport.0.odds / sport.1.odds

print(sport.table)
##       Sport
## Status  0  1
##      0  2 11
##      1 14 19
cat("Odds non-sport v sport:", sport.odds, '\n')
## Odds non-sport v sport: 4.052632
cat("Odds non-sport:", sport.0.odds, '\n')
## Odds non-sport: 7
cat("Odds sport:", sport.1.odds, '\n')
## Odds sport: 1.727273

International

int.table = table(Status = df$Status, International = df$International)
int.1.odds = int.table[2,2] / int.table[1,2]
int.0.odds = int.table[2,1] / int.table[1,1]
int.odds = int.1.odds / int.0.odds

print(int.table)
##       International
## Status  0  1
##      0 10  3
##      1 24  9
cat("Odds international v domestic:", int.odds, '\n')
## Odds international v domestic: 1.25
cat("Odds international:", int.1.odds, '\n')
## Odds international: 3
cat("Odds domestic:", int.0.odds, '\n')
## Odds domestic: 2.4

BTB

btb.table = table(Status = df$Status, B2B = df$BTB)
btb.1.odds = btb.table[2,2] / btb.table[1,2]
btc.0.odds = btb.table[2,1] / btb.table[1,1]
btc.odds = btc.0.odds / btb.1.odds

print(btb.table)
##       B2B
## Status  0  1
##      0  6  7
##      1 22 11
cat("Odds B2C v B2B:", btc.odds, '\n')
## Odds B2C v B2B: 2.333333
cat("Odds B2B:", btb.1.odds, '\n')
## Odds B2B: 1.571429
cat("Odds B2C:", btc.0.odds, '\n')
## Odds B2C: 3.666667

Bootstrapping

This is so we can gauge how much evidence we have that the differences in acceptance rates are real rather than an artifact of a small sample.

How this will work

Resample the rows of df with replacement, rebuild the Status-by-variable table, and compute the proportion of companies that signed within each group. Repeat that 1,000 times, take the difference in proportions each time, and look at what share of the resampled differences fall below zero: the closer that share is to 0 or 1, the more consistently one group outperforms the other.

# boot function: resample the rows with replacement and rebuild the
# Status-by-variable table for one bootstrap draw
boot.sample = function(data, column.name) {
  samp.data = data[sample(nrow(data), replace=TRUE), ]
  
  table(Status = samp.data$Status, GoodType = samp.data[[column.name]])
}

Tangible

set.seed(1)
# get bootstrap results
boot.results = replicate(1000, boot.sample(df, "Tangible"), simplify='array')

# acceptance proportion in each group (column 1 = Tangible 0, column 2 = Tangible 1)
intang = boot.results[2,1,] / (boot.results[1,1,] + boot.results[2,1,])
tang = boot.results[2,2,] / (boot.results[1,2,] + boot.results[2,2,])
tangible.difference = tang - intang

hist(tangible.difference)

# share of resamples where the intangible group out-accepts the tangible group
# (a bootstrap tail probability, not a confidence interval, despite the name)
tangible.CI = sum(tangible.difference < 0) / 1000
cat("Intangible v Tangible CI:", tangible.CI)
## Intangible v Tangible CI: 0.768
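
As an aside, the .CI objects here are bootstrap tail probabilities rather than confidence intervals. If an actual interval is wanted, a percentile interval from the same resamples is one option (a sketch, not output from the original run):

# hypothetical 95% percentile interval for the tangible-minus-intangible
# difference in acceptance proportions, using the same 1000 resamples
quantile(tangible.difference, probs = c(0.025, 0.975))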

Sport

set.seed(1)
# get bootstrap results
boot.results = replicate(1000, boot.sample(df, "Sport"), simplify='array')

# column 1 = Sport 0 (non-sport), column 2 = Sport 1
non.sport = boot.results[2,1,] / (boot.results[1,1,] + boot.results[2,1,])
sport = boot.results[2,2,] / (boot.results[1,2,] + boot.results[2,2,])
sport.difference = sport - non.sport

hist(sport.difference)

sport.CI = sum(sport.difference < 0) / 1000
cat("Nonsport v Sport CI:", sport.CI)
## Nonsport v Sport CI: 0.97

International

set.seed(1)
# get bootstrap results
boot.results = replicate(1000, boot.sample(df, "International"), simplify='array')

# column 1 = International 0 (domestic), column 2 = International 1
domestic = boot.results[2,1,] / (boot.results[1,1,] + boot.results[2,1,])
int = boot.results[2,2,] / (boot.results[1,2,] + boot.results[2,2,])
international.difference = domestic - int

hist(international.difference)

international.CI = sum(international.difference < 0) / 1000
cat("International v Domestic CI:", international.CI)
## International v Domestic CI: 0.613

BTB

set.seed(1)
# get bootstrap results
boot.results = replicate(1000, boot.sample(df, "BTB"), simplify='array')

# column 1 = BTB 0 (B2C), column 2 = BTB 1 (B2B)
btc = boot.results[2,1,] / (boot.results[1,1,] + boot.results[2,1,])
btb = boot.results[2,2,] / (boot.results[1,2,] + boot.results[2,2,])
btb.difference = btb - btc

hist(btb.difference)

btb.CI = sum(btb.difference < 0) / 1000
cat("B2C v B2B CI:", btb.CI)
## B2C v B2B CI: 0.387

Trees

# cv.tree() breaks down on the tidyverse-piped version of the data
# don't ask me why, but it is what it is
# plain base-R subsetting below works
df2 = STHQ[STHQ$Status != 2, ]

I tried logistic regression but didn't get much out of it, so I'm going to stick with a basic regression tree and use that as an example for Wyatt to understand. I think it should be solid.
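
For reference, the logistic regression I tried would have looked something like this (a sketch of the approach, not the original code):

# logistic regression on the same predictors; the coefficients weren't
# particularly informative, hence the switch to a tree
logit.fit = glm(Status ~ Sport + Tangible + BTB + International,
                family = binomial, data = df2)
summary(logit.fit)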

tree.pred = tree(Status ~ Sport + Tangible + BTB + International, data = df2)
summary(tree.pred)
## 
## Regression tree:
## tree(formula = Status ~ Sport + Tangible + BTB + International, 
##     data = df2)
## Variables actually used in tree construction:
## [1] "Sport" "BTB"  
## Number of terminal nodes:  3 
## Residual mean deviance:  0.1898 = 8.161 / 43 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.8750 -0.4545  0.1250  0.0000  0.2632  0.5455

Interestingly, Tangible and International weren't even used, which squares with the bootstrap evidence that neither showed a convincing difference in acceptance rates.

plot(tree.pred)
text(tree.pred, pretty = 0)

Pretty simple tree… but that’s good for this situation!

Cross Validation

cv.STHQ = cv.tree(tree.pred)
plot(cv.STHQ$size, cv.STHQ$dev, type = "b")

The best size is 3, which is what we had before, so this is pretty simple. Interestingly, it also suggests that whether the product was tangible or not isn't relevant.
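
If we want to pull that size out programmatically instead of reading it off the plot, a small sketch using the cv.tree output:

# size with the lowest cross-validated deviance
best.size = cv.STHQ$size[which.min(cv.STHQ$dev)]
best.size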

prune.STHQ <- prune.tree(tree.pred, best = 3)
plot(prune.STHQ)
text(prune.STHQ, pretty = 0)

Results from tree

pred = predict(tree.pred, newdata = df2)
tree.rmse = sqrt(mean((df2$Status - pred)^2))
cat("RMSE of Tree:", tree.rmse)
## RMSE of Tree: 0.4212167

Only a bit better than naive guessing… that's not great, especially since this is error on the training data.
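
To make "only a bit better" concrete, a reasonable baseline is always predicting the overall acceptance rate, the best constant guess (a sketch; the numbers in the comments are just arithmetic from the counts above):

# baseline: predict the overall acceptance rate for every company
baseline = mean(df2$Status)                       # 33/46, about 0.717
baseline.rmse = sqrt(mean((df2$Status - baseline)^2))
cat("RMSE of constant baseline:", baseline.rmse)  # sqrt(0.717 * 0.283), about 0.45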

Classification?

RMSE might not be the best to use here. Maybe we’ll look at classification?

df2$Status.f = factor(df2$Status)
tree.pred.f = tree(Status.f ~ Sport + Tangible + BTB + International, data = df2)
summary(tree.pred.f)
## 
## Classification tree:
## tree(formula = Status.f ~ Sport + Tangible + BTB + International, 
##     data = df2)
## Variables actually used in tree construction:
## [1] "Sport" "BTB"  
## Number of terminal nodes:  3 
## Residual mean deviance:  1.142 = 49.12 / 43 
## Misclassification error rate: 0.2609 = 12 / 46

A misclassification error of 0.26 isn't too bad, although overfitting can be an issue. We went into this expecting overfitting to be a risk, though.

plot(tree.pred.f)
text(tree.pred.f, pretty = 0)

The exact same tree as before.
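
Coming back to the overfitting worry, one way to probe it is to cross-validate the classification tree on misclassification error instead of deviance (a sketch using the same tree package, not something I ran above):

# cross-validate the classification tree, pruning on misclassification error
set.seed(1)
cv.STHQ.f = cv.tree(tree.pred.f, FUN = prune.misclass)
plot(cv.STHQ.f$size, cv.STHQ.f$dev, type = "b")  # dev counts misclassifications here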

Comparison

all.yes = nrow(declined) / nrow(df2)
cat("Classification error if only guessing yes:", all.yes)
## Classification error if only guessing yes: 0.2826087

So the tree's 0.26 misclassification rate only narrowly beats the 0.28 we would get by just predicting "yes" for every company.
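
For completeness, the tree's error rate can also be recomputed directly from its class predictions (a sketch; it should reproduce the 12/46 from the summary above):

# recompute the training misclassification rate from class predictions
class.pred = predict(tree.pred.f, type = "class")
cat("Tree misclassification rate:", mean(class.pred != df2$Status.f))  # 12/46, about 0.26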