TPO Heat Map Analysis
TPOSampleJoined %>%
ggplot(aes(LOG_TOT_MCF)) +
geom_density() +
scale_x_continuous(limits = c(-2,5), breaks = c(seq(-2,5,1))) +
scale_y_continuous(limits = c, breaks = c(seq(0,.7,.1))) +
theme_fivethirtyeight(base_size = 20, base_family = "serif") +
theme(axis.title = element_text(family = 'serif', size = 20)) +
labs(title = "Logged Volume Distribution", x= "Log(10) MCF",y = "Density")
TPOSampleJoined %>%
ggplot(aes(Pro_Rad_SqRt))+
geom_density() +
scale_x_continuous(limits = c(0, 15), breaks = c(seq(0,15,1))) +
scale_y_continuous(limits = c(0,.25), breaks = c(seq(0,.25,.05))) +
theme_fivethirtyeight(base_size = 20, base_family = "serif") +
theme(axis.title = element_text(family = 'serif', size = 20)) +
labs(title = "SqRt Radius Distribution", x= "SqRt Procurement Radius",y = "Density")
TPOSampleJoined <- TPOSampleJoined %>%
filter(TOT_MCF < 16000 & !is.na(TOT_MCF)) %>%
mutate(MILL_COUNTYCD = as.character(MILL_COUNTYCD),
MILL_STATECD = as.character(MILL_STATECD),
MTC = as.character(MTC))
#Remove Outliers from Joined Sample, store as 'UnusedFinal'...These mills provided a Pro Rad during 2019-2021 and can be joined back following model creation
##Remove those with radii >= 200 and <= 1
Unused1 <- TPOSampleJoined %>%
filter(Pro_Rad >= 200 | Pro_Rad <= 1 | Pro_Rad >= 150 & LOG_TOT_MCF < 0.5 & Region %in% c("Plains", "Lake"))
##Remove Lake entries with radii >= 100 & Log(10) MCF <= 0.5
# Unused2 <- TPOSampleJoined %>%
# filter(Region == "Lake" & LOG_TOT_MCF < 0.5 & Pro_Rad >= 100)
#
# Unused3 <- TPOSampleJoined %>%
# filter(MILL_STATE == "MI" & Pro_Rad > 150) %>%
# filter(!(MILL_NAME %in% Unused1$MILL_NAME))
#
# Unused4 <- TPOSampleJoined %>%
# filter(MILL_NAME == "ANTHONY & CO., INC.")
#
# Unused5 <- TPOSampleJoined %>%
# filter((Region == "MidAtl" & LOG_TOT_MCF < 1 & Pro_Rad >= 100))
#
# UnusedFinal <- Unused1 %>%
# bind_rows(Unused2, Unused5)
#
# rm(Unused1, Unused2, Unused5)
TPOSampleJoined <- TPOSampleJoined %>%
filter(!UnusedCheck %in% Unused1$UnusedCheck)
RegionCounts <- TPOSampleJoined %>%
filter(!is.na(Pro_Rad_SqRt)) %>%
dplyr::count(Region)
TPOSampleJoined %>%
filter(!is.na(Pro_Rad_SqRt) & !is.na(Region) & !is.na(LOG_TOT_MCF)) %>%
ggplot(aes(LOG_TOT_MCF)) +
geom_density() +
scale_x_continuous(limits = c(-2, 5), breaks = c(seq(-2,5,1))) +
scale_y_continuous(limits = c, breaks = c(seq(0,.7,.1))) +
theme_fivethirtyeight(base_size = 20, base_family = "serif") +
theme(axis.title = element_text(family = 'serif', size = 20)) +
labs(title = "Logged Volume Distribution", x= "Log(10) MCF",y = "Density")
TPOSampleJoined %>%
filter(!is.na(Pro_Rad_SqRt) & !is.na(Region) & !is.na(LOG_TOT_MCF)) %>%
ggplot(aes(Pro_Rad_SqRt)) +
geom_density() +
scale_x_continuous(limits = c(0, 15), breaks = c(seq(0,15,1))) +
scale_y_continuous(limits = c(0,.25), breaks = c(seq(0,.25,.05))) +
theme_fivethirtyeight(base_size = 20, base_family = "serif") +
theme(axis.title = element_text(family = 'serif', size = 20)) +
labs(title = "SqRt Radius Distribution", x= "SqRt Procurement Radius",y = "Density")
Call:
lm(formula = TPOSampleJoined$Pro_Rad_SqRt ~ TPOSampleJoined$LOG_TOT_MCF *
TPOSampleJoined$Region)
Residuals:
Min 1Q Median 3Q Max
-5.0677 -1.2180 -0.2464 1.2087 6.7138
Coefficients:
Estimate
(Intercept) 5.2810
TPOSampleJoined$LOG_TOT_MCF 0.9642
TPOSampleJoined$RegionLake 1.5126
TPOSampleJoined$RegionMidAtl 0.1150
TPOSampleJoined$RegionNewEngland -1.6127
TPOSampleJoined$RegionPlains -0.5243
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionLake -0.4986
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionMidAtl -0.3657
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionNewEngland 0.2796
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionPlains -0.1779
Std. Error
(Intercept) 0.5430
TPOSampleJoined$LOG_TOT_MCF 0.2145
TPOSampleJoined$RegionLake 0.7937
TPOSampleJoined$RegionMidAtl 0.7732
TPOSampleJoined$RegionNewEngland 0.7810
TPOSampleJoined$RegionPlains 0.7282
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionLake 0.3036
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionMidAtl 0.3005
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionNewEngland 0.2979
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionPlains 0.2914
t value
(Intercept) 9.725
TPOSampleJoined$LOG_TOT_MCF 4.494
TPOSampleJoined$RegionLake 1.906
TPOSampleJoined$RegionMidAtl 0.149
TPOSampleJoined$RegionNewEngland -2.065
TPOSampleJoined$RegionPlains -0.720
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionLake -1.642
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionMidAtl -1.217
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionNewEngland 0.939
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionPlains -0.610
Pr(>|t|)
(Intercept) < 0.0000000000000002
TPOSampleJoined$LOG_TOT_MCF 0.00000866
TPOSampleJoined$RegionLake 0.0572
TPOSampleJoined$RegionMidAtl 0.8819
TPOSampleJoined$RegionNewEngland 0.0394
TPOSampleJoined$RegionPlains 0.4718
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionLake 0.1012
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionMidAtl 0.2243
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionNewEngland 0.3484
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionPlains 0.5418
(Intercept) ***
TPOSampleJoined$LOG_TOT_MCF ***
TPOSampleJoined$RegionLake .
TPOSampleJoined$RegionMidAtl
TPOSampleJoined$RegionNewEngland *
TPOSampleJoined$RegionPlains
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionLake
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionMidAtl
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionNewEngland
TPOSampleJoined$LOG_TOT_MCF:TPOSampleJoined$RegionPlains
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.958 on 506 degrees of freedom
(2540 observations deleted due to missingness)
Multiple R-squared: 0.1988, Adjusted R-squared: 0.1845
F-statistic: 13.95 on 9 and 506 DF, p-value: < 0.00000000000000022
Test_Model <- lm(TPOSampleJoined$Pro_Rad_SqRt ~ TPOSampleJoined$LOG_TOT_MCF + TPOSampleJoined$MILL_STATE + 0)
# summary(Test_Model)
# Test_Res <- mean(abs(residuals(Test_Model)^2))
Test_NE <- TPOSampleJoined %>%
filter(Region == "NewEngland")
NE_Model = lm(Test_NE$Pro_Rad_SqRt ~ Test_NE$LOG_TOT_MCF)
summary(NE_Model)
Call:
lm(formula = Test_NE$Pro_Rad_SqRt ~ Test_NE$LOG_TOT_MCF)
Residuals:
Min 1Q Median 3Q Max
-4.6144 -1.2219 -0.3449 1.2442 6.7138
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.6683 0.5487 6.685 0.00000000297 ***
Test_NE$LOG_TOT_MCF 1.2438 0.2021 6.154 0.00000002935 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.914 on 79 degrees of freedom
(319 observations deleted due to missingness)
Multiple R-squared: 0.324, Adjusted R-squared: 0.3155
F-statistic: 37.87 on 1 and 79 DF, p-value: 0.00000002935
Call:
lm(formula = Test_MidAtl$Pro_Rad_SqRt ~ Test_MidAtl$LOG_TOT_MCF)
Residuals:
Min 1Q Median 3Q Max
-4.6992 -1.0704 -0.2317 0.9577 5.4396
Coefficients:
Estimate Std. Error t value
(Intercept) 5.3959 0.4904 11.004
Test_MidAtl$LOG_TOT_MCF 0.5985 0.1875 3.192
Pr(>|t|)
(Intercept) < 0.0000000000000002 ***
Test_MidAtl$LOG_TOT_MCF 0.00179 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.744 on 123 degrees of freedom
(499 observations deleted due to missingness)
Multiple R-squared: 0.07651, Adjusted R-squared: 0.069
F-statistic: 10.19 on 1 and 123 DF, p-value: 0.001792
Call:
lm(formula = Test_Central$Pro_Rad_SqRt ~ Test_Central$LOG_TOT_MCF)
Residuals:
Min 1Q Median 3Q Max
-5.0677 -1.4601 -0.1734 1.5167 5.0355
Coefficients:
Estimate Std. Error t value
(Intercept) 5.2810 0.5871 8.996
Test_Central$LOG_TOT_MCF 0.9642 0.2319 4.157
Pr(>|t|)
(Intercept) 0.00000000000000472 ***
Test_Central$LOG_TOT_MCF 0.00006132827517507 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.117 on 118 degrees of freedom
(524 observations deleted due to missingness)
Multiple R-squared: 0.1278, Adjusted R-squared: 0.1204
F-statistic: 17.28 on 1 and 118 DF, p-value: 0.00006133
Call:
lm(formula = Test_Lake$Pro_Rad_SqRt ~ Test_Lake$LOG_TOT_MCF)
Residuals:
Min 1Q Median 3Q Max
-4.3963 -1.1448 -0.3122 1.5763 5.6386
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.7935 0.5863 11.587 <0.0000000000000002
Test_Lake$LOG_TOT_MCF 0.4656 0.2176 2.139 0.0346
(Intercept) ***
Test_Lake$LOG_TOT_MCF *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.983 on 112 degrees of freedom
(712 observations deleted due to missingness)
Multiple R-squared: 0.03926, Adjusted R-squared: 0.03068
F-statistic: 4.576 on 1 and 112 DF, p-value: 0.03459
Call:
lm(formula = Test_Plains$Pro_Rad_SqRt ~ Test_Plains$LOG_TOT_MCF)
Residuals:
Min 1Q Median 3Q Max
-3.5742 -1.4026 -0.2425 1.0928 5.9987
Coefficients:
Estimate Std. Error t value
(Intercept) 4.7566 0.5046 9.426
Test_Plains$LOG_TOT_MCF 0.7863 0.2052 3.832
Pr(>|t|)
(Intercept) 0.0000000000000262 ***
Test_Plains$LOG_TOT_MCF 0.000264 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.037 on 74 degrees of freedom
(486 observations deleted due to missingness)
Multiple R-squared: 0.1656, Adjusted R-squared: 0.1543
F-statistic: 14.68 on 1 and 74 DF, p-value: 0.0002644
Intercepts <- c(summary(Central_Model)$coefficients[,4][1],
summary(Lake_Model)$coefficients[,4][1],
summary(MidAtl_Model)$coefficients[,4][1],
summary(NE_Model)$coefficients[,4][1],
summary(Plains_Model)$coefficients[,4][1])
Regions = c("Central", "Lake", "MidAtl", "NewEngland", "Plains")
PVals <- c(summary(Central_Model)$coefficients[,4][2],
summary(Lake_Model)$coefficients[,4][2],
summary(MidAtl_Model)$coefficients[,4][2],
summary(NE_Model)$coefficients[,4][2],
summary(Plains_Model)$coefficients[,4][2])
ModelStats <- data.frame(Region = Regions,Int = Intercepts, PVals = PVals)
# GAM <- gam(data = TPOSampleJoined, formula = Pro_Rad_SqRt ~ te(LOG_TOT_MCF, by = as.factor(Region), sp = .35, k = 3))
# summary(GAM)
# gam.check(GAM)
# plot.gam(GAM, residuals = TRUE, shade = TRUE, seWithMean = TRUE)
# GAM_Res <- residuals.gam(GAM)
# GAM_Res <- GAM_Res^2
# mean(GAM_Res)
# Central=function(x){coef(Best_Model)[2]*x+coef(Best_Model)[1]}
# Lake=function(x){coef(Best_Model)[2]*x+coef(Best_Model)[1]+coef(Best_Model)[3]}
# MI=function(x){coef(Best_Model)[2]*x+coef(Best_Model)[1]+coef(Best_Model)[4]}
# MidAtl=function(x){coef(Best_Model)[2]*x+coef(Best_Model)[1]+coef(Best_Model)[5]}
# NE=function(x){coef(Best_Model)[2]*x+coef(Best_Model)[1]+coef(Best_Model)[6]}
# PA=function(x){coef(Best_Model)[2]*x+coef(Best_Model)[1]+coef(Best_Model)[7]}
# Plains=function(x){coef(Best_Model)[2]*x+coef(Best_Model)[1]+coef(Best_Model)[8]}
#
# ggplot(TPOSampleJoined,aes(y=Pro_Rad_SqRt,x=LOG_TOT_MCF,color=Region))+
# geom_point()+
# stat_function(fun=Central,geom="line",color='yellow') +
# stat_function(fun=Lake,geom="line",color='blue') +
# stat_function(fun=MI,geom="line",color='red') +
# stat_function(fun=MidAtl,geom="line",color='black') +
# stat_function(fun=NE,geom="line",color='green') +
# stat_function(fun=PA,geom="line",color='orange') +
# stat_function(fun=Plains,geom="line",color='purple')
TPOSampleJoined <- TPOSampleJoined %>%
left_join(RegionCounts, by = "Region") %>%
left_join(ModelStats, by = "Region")
TPOSampleJoined %>%
filter(!is.na(Pro_Rad)) %>%
ggplot(aes(LOG_TOT_MCF, Pro_Rad_SqRt)) +
geom_point() +
geom_smooth(se = FALSE, method = "lm") +
scale_x_continuous(limits = c(-.5, 4.5), breaks = c(seq(-.5,4.5,.5))) +
scale_y_continuous(limits = c(0, 15), breaks = c(seq(0,14,2))) +
facet_wrap(~Region, nrow = 3) +
theme_fivethirtyeight(base_size = 20, base_family = "serif") +
theme(axis.title = element_text(family = 'serif', size = 20)) +
labs(title = "Linear Regression Test Plot", x = "Log(10) MCF",y = "Square Root of Procurement Radius") +
stat_regline_equation(label.x = -.5, label.y = 14, aes(label = ..rr.label..)) +
geom_text(x = -.25, y = 12, aes(label = paste0("n = ", n))) +
geom_text(x = -.2, y = 0, aes(label = paste0("P-Val = ", round(PVals, 8))))
ggplot(ModelStats, aes(Region, PVals)) +
geom_col() +
geom_text(aes(label = round(PVals, digits = 8)), size = 6 ,nudge_y = .001) +
scale_y_continuous(limits = c(0, .05), breaks = c(seq(0,.05,.01))) +
theme_fivethirtyeight(base_size = 20, base_family = "serif") +
theme(axis.title = element_text(family = 'serif', size = 20)) +
labs(title = "P-Values by Region", x = "Region", y = "P-Value")
VIFScores <- vif(Best_Model) %>%
as.data.frame(row.names = NULL)
VIFScores <- VIFScores %>%
dplyr::rename("VIF" = `GVIF^(1/(2*Df))`)
VIFs <- as.data.frame(VIFScores$VIF)
VIFs <- cbind(VIFs, Variable = c("Log(10) MCF", "Region", "Interaction"))
VIFs <- VIFs %>%
dplyr::rename('VIF' =`VIFScores$VIF`)
ggplot(VIFs, aes(Variable, VIF)) +
geom_col(position = "identity") +
geom_hline(yintercept = 4) +
geom_text(aes(label = round(VIF, digits = 2)), nudge_y = .12) +
scale_y_continuous(limits = c(0,5), breaks = c(seq(0,5,1))) +
theme_fivethirtyeight(base_size = 20, base_family = "serif") +
theme(axis.title = element_text(family = 'serif', size = 20)) +
labs(title = "Linear Regression Model - VIF Scores", y = "VIF Score", x = "Variable")
rm(VIFScores, VIFs)
Intercepts
(Intercept) (Intercept)
0.000000000000004719958607295 0.000000000000000000006974468
(Intercept) (Intercept)
0.000000000000000000050105485 0.000000002969597000132150000
(Intercept)
0.000000000000026187370596178
TPOSampleJoined %>%
filter(!is.na(Pro_Rad)) %>%
ggplot(aes(LOG_TOT_MCF, Pro_Rad_SqRt)) +
geom_point() +
geom_smooth(se = FALSE, method = "lm") +
# scale_x_continuous(limits = c(0, 16000), breaks = c(seq(0,16000,2000))) +
scale_y_continuous(limits = c(0, 14), breaks = c(seq(0,14,2))) +
facet_wrap(~MILL_STATE, scales = "free_x", nrow = 3) +
theme_fivethirtyeight(base_size = 20, base_family = "serif") +
theme(axis.title = element_text(family = 'serif', size = 20)) +
labs(title = "Linear Regression Test Plot", x = "MCF",y = "Procurement Radius") +
stat_regline_equation(label.x = 1, label.y = 14, aes(label = ..rr.label..))
VIFScores <- vif(Test_Model) %>%
as.data.frame(row.names = NULL)
VIFScores <- VIFScores %>%
dplyr::rename("VIF" = `GVIF^(1/(2*Df))`)
VIFs <- as.data.frame(VIFScores$VIF)
VIFs <- cbind(VIFs, Variable = c("Log(10) MCF", "State"))
VIFs <- VIFs %>%
dplyr::rename('VIF' =`VIFScores$VIF`)
ggplot(VIFs, aes(Variable, VIF)) +
geom_col(position = "identity") +
geom_hline(yintercept = 4) +
geom_text(aes(label = round(VIF, digits = 2)), nudge_y = .12) +
scale_y_continuous(limits = c(0,5), breaks = c(seq(0,5,1))) +
theme_fivethirtyeight(base_size = 20, base_family = "serif") +
theme(axis.title = element_text(family = 'serif', size = 20)) +
labs(title = "Linear Regression Model - VIF Scores", y = "GVIF Value", x = "Variable")
rm(VIFScores, VIFs)
# TPOSampleJoined <- TPOSampleJoined %>%
# select(-c(BestPredictions, BestResiduals))
TPOSampleJoined_Chi <- TPOSampleJoined %>%
filter(!is.na(Pro_Rad_SqRt) & !is.na(MILL_STATE) & !is.na(LOG_TOT_MCF))
# Chi Tests
Chi_Rad_Vol <- chisq.test(table(TPOSampleJoined_Chi$Pro_Rad_SqRt, TPOSampleJoined_Chi$LOG_TOT_MCF))
#Chi_Rad_State <- chisq.test(table(TPOSampleJoined_Chi$Pro_Rad_SqRt, TPOSampleJoined_Chi$MILL_STATE))
#Chi_Rad_Volt <- chisq.test(table(TPOSampleJoined_Chi$Pro_Rad_SqRt, TPOSampleJoined_Chi$volt))
Chi_Rad_Region <- chisq.test(table(TPOSampleJoined_Chi$Pro_Rad_SqRt, TPOSampleJoined_Chi$Region))
ChiStats <- c(Chi_Rad_Vol$statistic, Chi_Rad_Region$statistic)
ChiParams <- c(Chi_Rad_Vol$parameter, Chi_Rad_Region$parameter)
ChiPVals <- c(Chi_Rad_Vol$p.value, Chi_Rad_Region$p.value)
ChiJoined <- as.data.frame(cbind(Stats = ChiStats, Parameters = ChiParams, PValues = ChiPVals), row.names = c("Log(10) MCF", "Region")) %>%
rownames_to_column(var = "Variable")
ggplot(ChiJoined, aes(PValues, Variable)) +
geom_col(position = "identity") +
geom_hline(yintercept = .05) +
geom_text(aes(label = round(PValues, digits = 10)), nudge_x = .005) +
scale_x_continuous(limits = c(0,.2), breaks = c(seq(0,.2,.02))) +
theme_fivethirtyeight(base_size = 20, base_family = "serif") +
theme(axis.title = element_text(family = 'serif', size = 20)) +
labs(title = "P-Value of Model Variables in Relation to SqRt Procurement Radius", y = "Variable", x = "P-Value") +
coord_flip()
Chi_Rad_Vol
Pearson's Chi-squared test
data: table(TPOSampleJoined_Chi$Pro_Rad_SqRt, TPOSampleJoined_Chi$LOG_TOT_MCF)
X-squared = 13873, df = 13542, p-value = 0.02265
[1] "Correlation: 0.1995"
Chi_Rad_Region
Pearson's Chi-squared test
data: table(TPOSampleJoined_Chi$Pro_Rad_SqRt, TPOSampleJoined_Chi$Region)
X-squared = 168.51, df = 148, p-value = 0.1191
ggplot(Residuals, aes(Regions, Mean_Residuals)) +
geom_col(position = "identity") +
geom_text(aes(label = round(Mean_Residuals, digits = 1)), nudge_y = .5) +
scale_y_continuous(limits = c(0,30), breaks = c(seq(0,30,5))) +
theme_fivethirtyeight(base_size = 20, base_family = "serif") +
theme(axis.title = element_text(family = 'serif', size = 20)) +
labs(title = "Mean Residuals by Region", y = "Mean Residuals (Unstransformed Radius)", x = "Region")