TPO Heat Map Analysis

TPO Heat Map Analysis

Ian Kennedy
2022-06-08

1 Distribution Plots

1.1 Full Sample

Show code
TPOSampleJoined %>%
  ggplot(aes(LOG_TOT_MCF)) +
  geom_density() +
  scale_x_continuous(limits = c(-2,5), breaks = c(seq(-2,5,1))) +
  scale_y_continuous(limits = c, breaks = c(seq(0,.7,.1))) +
  theme_fivethirtyeight(base_size = 12, base_family = "serif") +
  theme(axis.title = element_text(family = 'serif', size = 14)) +
  labs(title = "Logged Volume Distribution", x= "Log(10) MCF",y = "Density") 
Show code
TPOSampleJoined %>%
  ggplot(aes(Pro_Rad_SqRt))+
  geom_density() + 
  scale_x_continuous(limits = c(0, 15), breaks = c(seq(0,15,1))) +
  scale_y_continuous(limits = c(0,.25), breaks = c(seq(0,.25,.05))) +
  theme_fivethirtyeight(base_size = 12, base_family = "serif") +
  theme(axis.title = element_text(family = 'serif', size = 14)) +
  labs(title = "SqRt Radius Distribution", x= "SqRt Procurement Radius",y = "Density")

Show code
TPOSampleJoined <- TPOSampleJoined %>%
  filter(TOT_MCF < 16000 & !is.na(TOT_MCF)) %>%
  mutate(MILL_COUNTYCD = as.character(MILL_COUNTYCD),
         MILL_STATECD = as.character(MILL_STATECD),
         MTC = as.character(MTC))

#Remove Outliers from Joined Sample, store as 'UnusedFinal'...These mills provided a Pro Rad during 2019-2021 and can be joined back following model creation
Unused1 <- TPOSampleJoined %>%
  filter((Pro_Rad >= 200 | Pro_Rad <= 1))  

Unused2 <- TPOSampleJoined %>%            
  filter(MILL_STATE == "OH" & volt == "low")

Unused3 <- TPOSampleJoined %>%              
  filter(MILL_STATE == "MI" & Pro_Rad > 150) %>%
  filter(!(MILL_NAME %in% Unused1$MILL_NAME))

Unused4 <- TPOSampleJoined %>%
  filter(MILL_NAME == "ANTHONY & CO., INC.")

UnusedFinal <- Unused1 %>%
  bind_rows(Unused2, Unused3, Unused4)

rm(Unused1, Unused2, Unused3, Unused4)
  
TPOSampleJoined <- TPOSampleJoined %>%
  filter(!UnusedCheck %in% UnusedFinal$UnusedCheck)



TPOSampleJoined <- TPOSampleJoined %>%
  mutate(MTC = case_when(MTC == "10" ~ "Sawmill",
                         MILL_STATE == "OH" & MTC != "10" ~ "Sawmill",
                         MILL_STATE %in% c("MO", "SD") ~ "Sawmill",
                         MTC != "10" ~ "Others")) 

1.2 Model-Obervations Only

Show code
TPOSampleJoined %>%
  filter(!is.na(Pro_Rad_SqRt) & !is.na(volt) & !is.na(MTC) & !is.na(MILL_STATE) & !is.na(LOG_TOT_MCF)) %>%
  ggplot(aes(LOG_TOT_MCF)) +
  geom_density() +
  scale_x_continuous(limits = c(-2, 5), breaks = c(seq(-2,5,1))) +
  scale_y_continuous(limits = c, breaks = c(seq(0,.7,.1))) +
  theme_fivethirtyeight(base_size = 12, base_family = "serif") +
  theme(axis.title = element_text(family = 'serif', size = 14)) +
  labs(title = "Logged Volume Distribution", x= "Log(10) MCF",y = "Density") 
Show code
TPOSampleJoined %>%
  filter(!is.na(Pro_Rad_SqRt) & !is.na(volt) & !is.na(MTC) & !is.na(MILL_STATE) & !is.na(LOG_TOT_MCF)) %>%
  ggplot(aes(Pro_Rad_SqRt)) +
  geom_density() + 
  scale_x_continuous(limits = c(0, 15), breaks = c(seq(0,15,1))) +
  scale_y_continuous(limits = c(0,.25), breaks = c(seq(0,.25,.05))) +
  theme_fivethirtyeight(base_size = 12, base_family = "serif") +
  theme(axis.title = element_text(family = 'serif', size = 14)) +
  labs(title = "SqRt Radius Distribution", x= "SqRt Procurement Radius",y = "Density")

2 Model Creation & Summary

Show code
# RegionTest <- TPOSampleJoined %>%
#   mutate(Region = case_when(MILL_STATECD %in% c("25", "23", "44", "33", "50", "9") ~ "NewEngland",
#                             MILL_STATECD %in% c("36", "34", "24", "10") ~ "MidAtl",
#                             MILL_STATECD %in% c("54", "39", "17", "18", "19") ~ "Central",
#                             MILL_STATECD %in% c("26", "27", "55") ~ "Lake",
#                             MILL_STATECD %in% c("38", "46", "31", "20", "29") ~ "Plains",
#                             MILL_STATECD == "42" ~ "PA"))
# 
# TPOSampleJoined %>%
#   dplyr::count(MILL_STATECD, volt, sort = TRUE) 

Best_Model <- lm(TPOSampleJoined$Pro_Rad ~ TPOSampleJoined$TOT_MCF + TPOSampleJoined$MILL_STATE + TPOSampleJoined$volt + 0)
summary(Best_Model)

Call:
lm(formula = TPOSampleJoined$Pro_Rad ~ TPOSampleJoined$TOT_MCF + 
    TPOSampleJoined$MILL_STATE + TPOSampleJoined$volt + 0)

Residuals:
    Min      1Q  Median      3Q     Max 
-54.567 -20.960  -8.422  16.168 115.347 

Coefficients:
                                    Estimate  Std. Error t value
TPOSampleJoined$TOT_MCF            0.0019813   0.0009188   2.156
TPOSampleJoined$MILL_STATEIA      83.5345708   7.3284715  11.399
TPOSampleJoined$MILL_STATEIL      51.5768161   7.2353945   7.128
TPOSampleJoined$MILL_STATEIN      57.1340456   6.3606360   8.982
TPOSampleJoined$MILL_STATEMI      70.2488211   3.8198170  18.391
TPOSampleJoined$MILL_STATEMO      54.5661445   4.6418215  11.755
TPOSampleJoined$MILL_STATEOH      76.1253880   5.5276194  13.772
TPOSampleJoined$MILL_STATEOthers  57.6047963   2.7165284  21.205
TPOSampleJoined$MILL_STATESD      44.2959245  12.9745960   3.414
TPOSampleJoined$voltlow          -14.4914337   3.2123067  -4.511
                                             Pr(>|t|)    
TPOSampleJoined$TOT_MCF                      0.031515 *  
TPOSampleJoined$MILL_STATEIA     < 0.0000000000000002 ***
TPOSampleJoined$MILL_STATEIL         0.00000000000347 ***
TPOSampleJoined$MILL_STATEIN     < 0.0000000000000002 ***
TPOSampleJoined$MILL_STATEMI     < 0.0000000000000002 ***
TPOSampleJoined$MILL_STATEMO     < 0.0000000000000002 ***
TPOSampleJoined$MILL_STATEOH     < 0.0000000000000002 ***
TPOSampleJoined$MILL_STATEOthers < 0.0000000000000002 ***
TPOSampleJoined$MILL_STATESD                 0.000691 ***
TPOSampleJoined$voltlow              0.00000800083270 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 31.01 on 512 degrees of freedom
  (2390 observations deleted due to missingness)
Multiple R-squared:  0.7849,    Adjusted R-squared:  0.7807 
F-statistic: 186.8 on 10 and 512 DF,  p-value: < 0.00000000000000022
Show code
Test_Model <- lm(TPOSampleJoined$Pro_Rad ~ TPOSampleJoined$TOT_MCF + TPOSampleJoined$Region + TPOSampleJoined$volt + 0)
summary(Test_Model)

Call:
lm(formula = TPOSampleJoined$Pro_Rad ~ TPOSampleJoined$TOT_MCF + 
    TPOSampleJoined$Region + TPOSampleJoined$volt + 0)

Residuals:
    Min      1Q  Median      3Q     Max 
-59.156 -21.938  -7.141  12.928 112.833 

Coefficients:
                                    Estimate  Std. Error t value
TPOSampleJoined$TOT_MCF            0.0020795   0.0009503   2.188
TPOSampleJoined$RegionCentral     66.7768826   3.2582164  20.495
TPOSampleJoined$RegionLake        71.2798762   3.1258612  22.803
TPOSampleJoined$RegionMidAtl      54.6043875   3.2321220  16.894
TPOSampleJoined$RegionNewEngland  54.1118219   4.4552893  12.146
TPOSampleJoined$RegionPlains      51.6677796   4.0039339  12.904
TPOSampleJoined$voltlow          -14.6645805   3.1588927  -4.642
                                             Pr(>|t|)    
TPOSampleJoined$TOT_MCF                        0.0291 *  
TPOSampleJoined$RegionCentral    < 0.0000000000000002 ***
TPOSampleJoined$RegionLake       < 0.0000000000000002 ***
TPOSampleJoined$RegionMidAtl     < 0.0000000000000002 ***
TPOSampleJoined$RegionNewEngland < 0.0000000000000002 ***
TPOSampleJoined$RegionPlains     < 0.0000000000000002 ***
TPOSampleJoined$voltlow                    0.00000438 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 30.91 on 515 degrees of freedom
  (2390 observations deleted due to missingness)
Multiple R-squared:  0.7851,    Adjusted R-squared:  0.7821 
F-statistic: 268.7 on 7 and 515 DF,  p-value: < 0.00000000000000022
Show code
Test_Res <- mean(abs(residuals(Test_Model)))

# TPOSampleJoined$MILL_STATE <- as.character(TPOSampleJoined$MILL_STATE)

# GAM <- gam(data = TPOSampleJoined, formula = Pro_Rad ~ te(TOT_MCF + MILL_STATE + volt, sp = .2, k = 9))
# summary(GAM)
# gam.check(GAM)
# 
# plot.gam(GAM, residuals = TRUE, shade = TRUE, seWithMean = TRUE)
# 
# GAM_Res <- mean(abs(residuals.gam(GAM)))

3 Model/VIF Visualization

3.1 Preferred Model (Volume, State, Volume Classification)

Show code
TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & !is.na(volt) & !is.na(MTC)) %>%
  ggplot(aes(TOT_MCF, Pro_Rad)) +
    geom_point() +
    geom_smooth(se = FALSE, method = "lm") +
    # scale_x_continuous(limits = c(0, 16000), breaks = c(seq(0,16000,2000))) +
    scale_y_continuous(limits = c(0, 200), breaks = c(seq(0,200,25))) +
    facet_wrap(facets = vars(MILL_STATE, volt), scales = "free_x", nrow = 3) +
    theme_fivethirtyeight(base_size = 12, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 14)) +
    labs(title = "Linear Regression Test Plot", x = "MCF",y = "Procurement Radius") +
    stat_regline_equation(label.x = 1, label.y = 180, aes(label = ..rr.label..))
Show code
VIFScores <- vif(Best_Model) %>%
  as.data.frame(row.names = NULL)

VIFScores <- VIFScores %>%
  dplyr::rename("VIF" = `GVIF^(1/(2*Df))`)
VIFs <- as.data.frame(VIFScores$VIF)
VIFs <- cbind(VIFs, Variable = c("MCF", "State", "Volume Classificaion"))
VIFs <- VIFs %>%
  dplyr::rename('VIF' =`VIFScores$VIF`)

ggplot(VIFs, aes(Variable, VIF)) +
  geom_col(position = "identity") + 
  geom_hline(yintercept = 4) +
  geom_text(aes(label = round(VIF, digits = 2)), nudge_y = .12) +
  scale_y_continuous(limits = c(0,5), breaks = c(seq(0,5,1))) +
  theme_fivethirtyeight(base_size = 12, base_family = "serif") +
  theme(axis.title = element_text(family = 'serif', size = 14)) +
  labs(title = "Linear Regression Model - VIF Scores", y = "GVIF Value", x = "Variable")
Show code
rm(VIFScores, VIFs)

3.2 Fall-Back Model (Volume, Region, & Volume Classification)

Show code
TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & !is.na(volt) & !is.na(MTC)) %>%
  ggplot(aes(TOT_MCF, Pro_Rad)) +
    geom_point() +
    geom_smooth(se = FALSE, method = "lm") +
    # scale_x_continuous(limits = c(0, 16000), breaks = c(seq(0,16000,2000))) +
    scale_y_continuous(limits = c(0, 200), breaks = c(seq(0,200,25))) +
    facet_wrap(facets = vars(Region, volt), scales = "free_x", nrow = 3) +
    theme_fivethirtyeight(base_size = 12, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 14)) +
    labs(title = "Linear Regression Test Plot", x = "MCF",y = "Procurement Radius") +
    stat_regline_equation(label.x = 1, label.y = 180, aes(label = ..rr.label..))
Show code
VIFScores <- vif(Test_Model) %>%
  as.data.frame(row.names = NULL)

VIFScores <- VIFScores %>%
  dplyr::rename("VIF" = `GVIF^(1/(2*Df))`)
VIFs <- as.data.frame(VIFScores$VIF)
VIFs <- cbind(VIFs, Variable = c("MCF", "Region", "Volume Classificaion"))
VIFs <- VIFs %>%
  dplyr::rename('VIF' =`VIFScores$VIF`)

ggplot(VIFs, aes(Variable, VIF)) +
  geom_col(position = "identity") + 
  geom_hline(yintercept = 4) +
  geom_text(aes(label = round(VIF, digits = 2)), nudge_y = .12) +
  scale_y_continuous(limits = c(0,5), breaks = c(seq(0,5,1))) +
  theme_fivethirtyeight(base_size = 12, base_family = "serif") +
  theme(axis.title = element_text(family = 'serif', size = 14)) +
  labs(title = "Linear Regression Model - VIF Scores", y = "GVIF Value", x = "Variable")
Show code
# TPOSampleJoined <- TPOSampleJoined %>%
#   select(-c(BestPredictions, BestResiduals))

4 Chi-Squared Tests

Show code
TPOSampleJoined_Chi <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad_SqRt) & !is.na(volt) & !is.na(MILL_STATE) & !is.na(MTC) & !is.na(LOG_TOT_MCF)) 

# Chi Tests

Chi_Rad_Vol <- chisq.test(table(TPOSampleJoined_Chi$Pro_Rad, TPOSampleJoined_Chi$TOT_MCF))
Chi_Rad_State <- chisq.test(table(TPOSampleJoined_Chi$Pro_Rad, TPOSampleJoined_Chi$MILL_STATE))
Chi_Rad_Volt <- chisq.test(table(TPOSampleJoined_Chi$Pro_Rad, TPOSampleJoined_Chi$volt))
Chi_Rad_Region <- chisq.test(table(TPOSampleJoined_Chi$Pro_Rad, TPOSampleJoined_Chi$Region))

ChiStats <- c(Chi_Rad_Vol$statistic, Chi_Rad_State$statistic, Chi_Rad_Volt$statistic, Chi_Rad_Region$statistic)
ChiParams <- c(Chi_Rad_Vol$parameter, Chi_Rad_State$parameter, Chi_Rad_Volt$parameter, Chi_Rad_Region$parameter)
ChiPVals <- c(Chi_Rad_Vol$p.value, Chi_Rad_State$p.value, Chi_Rad_Volt$p.value, Chi_Rad_Region$p.value)

ChiJoined <- as.data.frame(cbind(Stats = ChiStats, Parameters = ChiParams, PValues = ChiPVals), row.names = c("Volume", "State", "Volume Classification", "Region")) %>%
  rownames_to_column(var = "Variable")

ggplot(ChiJoined, aes(PValues, Variable)) +
  geom_col(position = "identity") + 
  geom_hline(yintercept = .05) +
  geom_text(aes(label = round(PValues, digits = 10)), nudge_x = .005) +
  scale_x_continuous(limits = c(0,.2), breaks = c(seq(0,.2,.02))) +
  theme_fivethirtyeight(base_size = 12, base_family = "serif") +
  theme(axis.title = element_text(family = 'serif', size = 14)) +
  labs(title = "P-Value of Model Variables in Relation to Procurement Radius", y = "Variable", x = "P-Value") +
  coord_flip()

4.1 Sqrt(Procurement Radius) & Log(10) MCF

Show code
Chi_Rad_Vol

    Pearson's Chi-squared test

data:  table(TPOSampleJoined_Chi$Pro_Rad, TPOSampleJoined_Chi$TOT_MCF)
X-squared = 14754, df = 14250, p-value = 0.00155
Show code
Correlation <- cor(TPOSampleJoined_Chi$Pro_Rad, TPOSampleJoined_Chi$TOT_MCF)

print(paste0("Correlation: ", round(Correlation, digits = 4)))
[1] "Correlation: 0.1697"

4.2 Procurement Radius & State

Show code
Chi_Rad_State

    Pearson's Chi-squared test

data:  table(TPOSampleJoined_Chi$Pro_Rad, TPOSampleJoined_Chi$MILL_STATE)
X-squared = 293.39, df = 266, p-value = 0.1195

4.3 Procurement Radius & Region

Show code
Chi_Rad_Region

    Pearson's Chi-squared test

data:  table(TPOSampleJoined_Chi$Pro_Rad, TPOSampleJoined_Chi$Region)
X-squared = 175.04, df = 152, p-value = 0.09717

4.4 Procurement Radius & Volume Classification (High/Low)

Show code
Chi_Rad_Volt

    Pearson's Chi-squared test

data:  table(TPOSampleJoined_Chi$Pro_Rad, TPOSampleJoined_Chi$volt)
X-squared = 113.55, df = 38, p-value = 0.000000001874