library(readxl)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.2
## Warning: package 'ggplot2' was built under R version 4.4.2
## Warning: package 'tidyr' was built under R version 4.4.2
## Warning: package 'readr' was built under R version 4.4.2
## Warning: package 'purrr' was built under R version 4.4.2
## Warning: package 'forcats' was built under R version 4.4.2
## Warning: package 'lubridate' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Warning: package 'car' was built under R version 4.4.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.2
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.4.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.4.2
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

Loading the Data

data <- read_excel("QUANT_Merge.xlsx")

Starting with the statistical analysis and box plots before moving to the regression.

There are issues with the summary below:

  1. The Yes/No and Type summaries to not provide useful information.

  2. All other summaries are for the dataset as a whole instead of each yes/no category, so there is functionally no useful information/insight at all.

I will split the data set into three sheets (the current one, one with only project streets, and one with non-project streets), then do these again.

summary(data)
##   SegmentCost       CostPerFoot           PCI            Calls      
##  Min.   :      0   Min.   :   0.00   Min.   : 0.00   Min.   :0.000  
##  1st Qu.:      0   1st Qu.:   0.00   1st Qu.:61.90   1st Qu.:1.000  
##  Median :      0   Median :   0.00   Median :79.47   Median :1.000  
##  Mean   :  11930   Mean   :  24.10   Mean   :73.75   Mean   :1.578  
##  3rd Qu.:   5605   3rd Qu.:  10.38   3rd Qu.:90.02   3rd Qu.:2.000  
##  Max.   :1049145   Max.   :1096.19   Max.   :99.99   Max.   :9.000  
##                                                                     
##  TimeProximity    ProjectYesNo       Application            Type          
##  Min.   :  0.00   Length:47295       Length:47295       Length:47295      
##  1st Qu.: 20.00   Class :character   Class :character   Class :character  
##  Median : 60.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 50.09                                                           
##  3rd Qu.: 80.00                                                           
##  Max.   :100.00                                                           
##  NA's   :24736                                                            
##  SegmentLength       VOIDProjectCost      Fiscal Year    VOIDProjectLength 
##  Min.   :    1.701   Min.   :    139.2   Min.   :2023    Min.   :   63.58  
##  1st Qu.:  271.197   1st Qu.:   8249.6   1st Qu.:2024    1st Qu.: 1067.50  
##  Median :  356.750   Median :  29965.3   Median :2025    Median : 1951.63  
##  Mean   :  492.807   Mean   : 129272.0   Mean   :2025    Mean   : 2707.60  
##  3rd Qu.:  610.313   3rd Qu.: 141707.3   3rd Qu.:2027    3rd Qu.: 3318.70  
##  Max.   :11656.087   Max.   :2428774.3   Max.   :2028    Max.   :34891.59  
##                      NA's   :24736       NA's   :24736   NA's   :24736     
##     Status          PavementWidth  
##  Length:47295       Min.   : 8.00  
##  Class :character   1st Qu.:26.00  
##  Mode  :character   Median :28.50  
##                     Mean   :30.67  
##                     3rd Qu.:30.00  
##                     Max.   :96.00  
## 
Project <- read_excel("ProjectStreets.xlsx")
NOProject <- read_excel("NoProjects.xlsx")

Doing summaries on the split sets

summary(Project)
##   SegmentCost         CostPerFoot             PCI            Calls     
##  Min.   :     72.3   Min.   :   0.1748   Min.   : 0.00   Min.   :1.00  
##  1st Qu.:   1984.2   1st Qu.:   5.2415   1st Qu.:70.14   1st Qu.:2.00  
##  Median :   6300.1   Median :  16.9329   Median :86.38   Median :2.00  
##  Mean   :  25010.6   Mean   :  50.5211   Mean   :79.24   Mean   :2.39  
##  3rd Qu.:  26367.3   3rd Qu.:  60.2667   3rd Qu.:94.00   3rd Qu.:3.00  
##  Max.   :1049144.6   Max.   :1096.1860   Max.   :99.99   Max.   :9.00  
##  TimeProximity    ProjectYesNo       Application            Type          
##  Min.   :  0.00   Length:22559       Length:22559       Length:22559      
##  1st Qu.: 20.00   Class :character   Class :character   Class :character  
##  Median : 60.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 50.08                                                           
##  3rd Qu.: 80.00                                                           
##  Max.   :100.00                                                           
##  SegmentLength    VOIDProjectCost      Fiscal Year   VOIDProjectLength 
##  Min.   :  18.6   Min.   :    139.2   Min.   :2023   Min.   :   63.58  
##  1st Qu.: 281.8   1st Qu.:   8249.6   1st Qu.:2024   1st Qu.: 1067.50  
##  Median : 362.8   Median :  29965.3   Median :2025   Median : 1951.63  
##  Mean   : 506.7   Mean   : 129272.0   Mean   :2025   Mean   : 2707.60  
##  3rd Qu.: 641.8   3rd Qu.: 141707.3   3rd Qu.:2027   3rd Qu.: 3318.70  
##  Max.   :9137.3   Max.   :2428774.3   Max.   :2028   Max.   :34891.59  
##     Status          PavementWidth  
##  Length:22559       Min.   :10.00  
##  Class :character   1st Qu.:26.00  
##  Mode  :character   Median :28.50  
##                     Mean   :30.43  
##                     3rd Qu.:30.00  
##                     Max.   :90.00
summary(NOProject)
##   SegmentCost  CostPerFoot      PCI            Calls        TimeProximity 
##  Min.   :0    Min.   :0    Min.   : 0.00   Min.   :0.0000   Mode:logical  
##  1st Qu.:0    1st Qu.:0    1st Qu.:55.80   1st Qu.:0.0000   NA's:24736    
##  Median :0    Median :0    Median :73.40   Median :1.0000                 
##  Mean   :0    Mean   :0    Mean   :68.75   Mean   :0.8365                 
##  3rd Qu.:0    3rd Qu.:0    3rd Qu.:86.29   3rd Qu.:1.0000                 
##  Max.   :0    Max.   :0    Max.   :99.69   Max.   :7.0000                 
##  ProjectYesNo       Application      Type         SegmentLength      
##  Length:24736       Mode:logical   Mode:logical   Min.   :    1.701  
##  Class :character   NA's:24736     NA's:24736     1st Qu.:  256.509  
##  Mode  :character                                 Median :  350.751  
##                                                   Mean   :  480.138  
##                                                   3rd Qu.:  578.935  
##                                                   Max.   :11656.087  
##  VOIDProjectCost Fiscal Year    VOIDProjectLength    Status         
##  Mode:logical    Mode:logical   Mode:logical      Length:24736      
##  NA's:24736      NA's:24736     NA's:24736        Class :character  
##                                                   Mode  :character  
##                                                                     
##                                                                     
##                                                                     
##  PavementWidth  
##  Min.   : 8.00  
##  1st Qu.:26.00  
##  Median :28.50  
##  Mean   :30.88  
##  3rd Qu.:32.00  
##  Max.   :96.00

Loading the boxplot, with the “yes” project category split between preservation and rehabilitation

This displayed interesting results, but, as expected, leave some questions that may be better answered by consolidating all project streets and contrasting with non-project streets as one unified category.

Some observations:

  1. Preservation projects were generally reserved for higher PCI streets than rehabilitation projects. This did not come as a surprise, as high PCI streets would less likely warrant intensive repairs.

  2. The streets not selected for a project tend to have a higher PCI than rehabilitation streets, but only moderately so.

  3. The non-project streets tend to have a lower PCI than streets selected for preservation. Being a dramatically-larger portion of the dataset, non-project streets, there may be more to look at.

ggplot(data,aes(x=Type,y=PCI)) + geom_boxplot() + labs(title="PCI by Project Type",subtitle="From the full dataset")

Loading the boxplot, with yes and no as the categories.

Where the first boxplot confirmed some simple assumptions, such as preservation streets going to higher-PCI streets than rehabilitation streets, this one gave a clearer picture of an issue warranting future study. The bulk of streets selected for projects tend to have a higher PCI than those not selected.

I may consider testing this against a the further-out fiscal years to confirm these findings, just in case the close time proximity between PCI evaluations and the earlier fiscal years led to any ratings being made post-project.

ggplot(data,aes(x=ProjectYesNo,y=PCI)) + geom_boxplot() + labs(title="PCI by Project (Yes/No)",subtitle="From the full dataset")

Created a dataframe with FY2023 and 2024 removed, so there it is clear beyond any reasonable doubt that the PCI measurements were taken before any of the projects associated with them.

filtered_data <- data[!(data$`Fiscal Year` %in% c(2023, 2024)), ]

In principal, the filtered data yielded the same results: Streets not selected for projects tended to have a lower PCI than those chosen for projects.

ggplot(filtered_data,aes(x=ProjectYesNo,y=PCI)) + geom_boxplot() + labs(title="PCI by Project (Yes/No) - FY23 AND 24 REMOVED",subtitle="From the full dataset WITH FY23 AND 24 REMOVED")

Now, I will run boxplots of the call volumes. I will not be using the filtered dataset moving forward, as it was only an exercise to confirm the validity of the full dataset.

I was confused about some of the boxes abruptly stopping after the mean value, but I understand now that calls above the median are so rare and disproportionately high, that they are outside of the percentile range the box includes.

This has interesting results that feel very intuitive. Streets with no project get lower calls than those with projects. When broken down, streets with rehabilitation projects have a similar volume as preservation projects, albiet with more variety, but both still independently have higher volumes than non-project streets.

ggplot(data,aes(x=Type,y=Calls)) + geom_boxplot() + labs(title="Calls by Project Type",subtitle="From the full dataset")

ggplot(data,aes(x=ProjectYesNo,y=Calls)) + geom_boxplot() + labs(title="Calls by Project (Yes/No)",subtitle="From the full dataset")

The second method will specifically look at segments with a project associated with them. This analysis, having more options for continuous variables, will go through a regression, with Cost per Linear Foot representing the dependent variable. The paper will look at the relationship between the dependent variable and the PCI, the year the project is scheduled, and the total cost of the project. The fiscal years will be expressed by a “timeliness score” in increments of 20, from 0 representing FY 2028 to 100 representing FY 2023. Although Cost per Linear Foot and Total Project Cost are tied to each other, this paper will test if Public Works is averse to projects with large price tags, as they would displace a larger number of projects that can serve multiple constituencies around the city instead of only the users of that single road, or if unit cost drives their decisions.

ggplot(Project,aes(x=PCI,y=CostPerFoot)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Did KS test due to high sample size.

ks.test(Project$CostPerFoot,"pnorm")
## Warning in ks.test.default(Project$CostPerFoot, "pnorm"): ties should not be
## present for the one-sample Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  Project$CostPerFoot
## D = 0.98445, p-value < 2.2e-16
## alternative hypothesis: two-sided
ProjectSQRT<-Project %>% mutate(CostPerFootSQRT=sqrt(CostPerFoot))

ks.test(ProjectSQRT$CostPerFootSQRT,"pnorm")
## Warning in ks.test.default(ProjectSQRT$CostPerFootSQRT, "pnorm"): ties should
## not be present for the one-sample Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  ProjectSQRT$CostPerFootSQRT
## D = 0.93361, p-value < 2.2e-16
## alternative hypothesis: two-sided
ggplot(ProjectSQRT,aes(x=PCI,y=CostPerFootSQRT)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

hist(Project$CostPerFoot)

hist(ProjectSQRT$CostPerFootSQRT)

I don’t know if the square root transformation made much of a difference from the scatterplot, but the histogram seems to show an improvement. I will make models of both, look at the summaries, and do VIF tests.

#A model without the square root
CostPerFoot_lm<-lm(CostPerFoot~PCI+Calls+TimeProximity+SegmentCost,data=Project)
summary(CostPerFoot_lm)
## 
## Call:
## lm(formula = CostPerFoot ~ PCI + Calls + TimeProximity + SegmentCost, 
##     data = Project)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -794.67  -19.51   -8.80    4.34  374.19 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.842e+01  1.606e+00  61.293  < 2e-16 ***
## PCI           -9.330e-01  1.793e-02 -52.034  < 2e-16 ***
## Calls         -1.925e+00  2.954e-01  -6.515 7.41e-11 ***
## TimeProximity  1.230e-01  1.001e-02  12.293  < 2e-16 ***
## SegmentCost    9.788e-04  6.893e-06 141.995  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48.44 on 22554 degrees of freedom
## Multiple R-squared:  0.596,  Adjusted R-squared:  0.5959 
## F-statistic:  8318 on 4 and 22554 DF,  p-value: < 2.2e-16
vif(CostPerFoot_lm)
##           PCI         Calls TimeProximity   SegmentCost 
##      1.223011      1.016427      1.053517      1.162139
durbinWatsonTest(CostPerFoot_lm)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.1769242       1.64602       0
##  Alternative hypothesis: rho != 0
#A model with the square root
CostPerFootSQRT_lm<-lm(CostPerFootSQRT~PCI+Calls+TimeProximity+SegmentCost,data=ProjectSQRT)
summary(CostPerFootSQRT_lm)
## 
## Call:
## lm(formula = CostPerFootSQRT ~ PCI + Calls + TimeProximity + 
##     SegmentCost, data = ProjectSQRT)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.034  -1.621  -0.725   1.262  14.815 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.002e+01  9.269e-02 108.154  < 2e-16 ***
## PCI           -6.860e-02  1.035e-03 -66.277  < 2e-16 ***
## Calls         -1.955e-01  1.705e-02 -11.464  < 2e-16 ***
## TimeProximity  4.586e-03  5.777e-04   7.939 2.13e-15 ***
## SegmentCost    5.080e-05  3.979e-07 127.671  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.796 on 22554 degrees of freedom
## Multiple R-squared:  0.587,  Adjusted R-squared:  0.5869 
## F-statistic:  8013 on 4 and 22554 DF,  p-value: < 2.2e-16
vif(CostPerFootSQRT_lm)
##           PCI         Calls TimeProximity   SegmentCost 
##      1.223011      1.016427      1.053517      1.162139
durbinWatsonTest(CostPerFootSQRT_lm)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.3794984      1.240957       0
##  Alternative hypothesis: rho != 0

Doing a Cor Test on Cost Per Foot and Project Cost

cor.test(Project$CostPerFoot,Project$SegmentCost)
## 
##  Pearson's product-moment correlation
## 
## data:  Project$CostPerFoot and Project$SegmentCost
## t = 164.48, df = 22557, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7324718 0.7443393
## sample estimates:
##       cor 
## 0.7384627
cor.test(ProjectSQRT$CostPerFootSQRT,ProjectSQRT$SegmentCost)
## 
##  Pearson's product-moment correlation
## 
## data:  ProjectSQRT$CostPerFootSQRT and ProjectSQRT$SegmentCost
## t = 150.2, df = 22557, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7005572 0.7136064
## sample estimates:
##      cor 
## 0.707142

Plotting the relationship between Cost Per Foot and Project Cost for a visual understanding.

This might be interesting. If I understand this, it seems like, at least when taking everything in at once without distinguishing project types, unit price does not get cheaper as the total project cost goes up. That seems to heavily discourage extensive or large-scale projects.

ggplot(Project,aes(x=CostPerFoot,y=SegmentCost)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

I am going to make a model without total cost to see what happens.

I ended up throwing in the originals too, so I could see the difference more easily.

I will post my observations under their corresponding formulas with the pound sign.

#A model without the square root
CostPerFoot_lm<-lm(CostPerFoot~PCI+Calls+TimeProximity+SegmentCost,data=Project)
summary(CostPerFoot_lm)
## 
## Call:
## lm(formula = CostPerFoot ~ PCI + Calls + TimeProximity + SegmentCost, 
##     data = Project)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -794.67  -19.51   -8.80    4.34  374.19 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.842e+01  1.606e+00  61.293  < 2e-16 ***
## PCI           -9.330e-01  1.793e-02 -52.034  < 2e-16 ***
## Calls         -1.925e+00  2.954e-01  -6.515 7.41e-11 ***
## TimeProximity  1.230e-01  1.001e-02  12.293  < 2e-16 ***
## SegmentCost    9.788e-04  6.893e-06 141.995  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48.44 on 22554 degrees of freedom
## Multiple R-squared:  0.596,  Adjusted R-squared:  0.5959 
## F-statistic:  8318 on 4 and 22554 DF,  p-value: < 2.2e-16
vif(CostPerFoot_lm)
##           PCI         Calls TimeProximity   SegmentCost 
##      1.223011      1.016427      1.053517      1.162139
durbinWatsonTest(CostPerFoot_lm)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.1769242       1.64602       0
##  Alternative hypothesis: rho != 0
#Refer to the next model notes below

#NO COST NORMAL
NOCOSTCostPerFoot_lm<-lm(CostPerFoot~PCI+Calls+TimeProximity,data=Project)
summary(NOCOSTCostPerFoot_lm)
## 
## Call:
## lm(formula = CostPerFoot ~ PCI + Calls + TimeProximity, data = Project)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -191.24  -33.60  -14.62    7.63 1071.35 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   195.92987    1.99741  98.092   <2e-16 ***
## PCI            -1.86216    0.02297 -81.053   <2e-16 ***
## Calls          -3.92738    0.40608  -9.671   <2e-16 ***
## TimeProximity   0.23050    0.01373  16.785   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 66.66 on 22555 degrees of freedom
## Multiple R-squared:  0.2348, Adjusted R-squared:  0.2347 
## F-statistic:  2307 on 3 and 22555 DF,  p-value: < 2.2e-16
vif(NOCOSTCostPerFoot_lm)
##           PCI         Calls TimeProximity 
##       1.06014       1.01411       1.04749
durbinWatsonTest(NOCOSTCostPerFoot_lm)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.3351652      1.329635       0
##  Alternative hypothesis: rho != 0
# No matter what, all values have extreme significance. 1)PCI has much lower estimate, 2) calls has higher estimate and rises to meet the significance of the others (even though it was already high), 3) time proximity has lower estimate
#Collinearity tests are functionally the same

#A model with the square root
CostPerFootSQRT_lm<-lm(CostPerFootSQRT~PCI+Calls+TimeProximity+SegmentCost,data=ProjectSQRT)
summary(CostPerFootSQRT_lm)
## 
## Call:
## lm(formula = CostPerFootSQRT ~ PCI + Calls + TimeProximity + 
##     SegmentCost, data = ProjectSQRT)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.034  -1.621  -0.725   1.262  14.815 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.002e+01  9.269e-02 108.154  < 2e-16 ***
## PCI           -6.860e-02  1.035e-03 -66.277  < 2e-16 ***
## Calls         -1.955e-01  1.705e-02 -11.464  < 2e-16 ***
## TimeProximity  4.586e-03  5.777e-04   7.939 2.13e-15 ***
## SegmentCost    5.080e-05  3.979e-07 127.671  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.796 on 22554 degrees of freedom
## Multiple R-squared:  0.587,  Adjusted R-squared:  0.5869 
## F-statistic:  8013 on 4 and 22554 DF,  p-value: < 2.2e-16
vif(CostPerFootSQRT_lm)
##           PCI         Calls TimeProximity   SegmentCost 
##      1.223011      1.016427      1.053517      1.162139
durbinWatsonTest(CostPerFootSQRT_lm)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.3794984      1.240957       0
##  Alternative hypothesis: rho != 0
#See next model notes

#NO COST SQRT
NOCOSTCostPerFootSQRT_lm<-lm(CostPerFootSQRT~PCI+Calls+TimeProximity,data=ProjectSQRT)
summary(NOCOSTCostPerFootSQRT_lm)
## 
## Call:
## lm(formula = CostPerFootSQRT ~ PCI + Calls + TimeProximity, data = ProjectSQRT)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.4094  -2.2656  -0.9285   1.5169  29.1533 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   15.085552   0.109963  137.19   <2e-16 ***
## PCI           -0.116824   0.001265  -92.36   <2e-16 ***
## Calls         -0.299431   0.022356  -13.39   <2e-16 ***
## TimeProximity  0.010164   0.000756   13.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.67 on 22555 degrees of freedom
## Multiple R-squared:  0.2885, Adjusted R-squared:  0.2884 
## F-statistic:  3048 on 3 and 22555 DF,  p-value: < 2.2e-16
vif(NOCOSTCostPerFootSQRT_lm)
##           PCI         Calls TimeProximity 
##       1.06014       1.01411       1.04749
durbinWatsonTest(NOCOSTCostPerFootSQRT_lm)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.5147075     0.9705726       0
##  Alternative hypothesis: rho != 0
#From what I can see, it seems like this one follows the same pattern, but with blunted results.

Given the results of these models, and the original comparisons between the un-transformed variable and the SQRT version, I’m not seeing a very compelling reason to use the SQRT version. The actual visual inspection looked more aesthetically pleasing, but the actual tests did not feel like a groundbreaking difference. The significance of all exercises still end up very high, so it seems that the un-transformed variable is still appropriate to use.