Project Purpose, Contents, and Data Sources:


Purpose


Contents:


The Data:


Data Collection and Preparation:


DI_df <- read.csv("Disease_Indicators_data.csv")

DI_df2<- DI_df%>%dplyr::select("County", "Percent.of.population.aged.18.34", "Percent.of.population.65.or.older", "Number.of.active.physicians","Number.of.hospital.beds", "Total.serious.crimes" , "Percent.high.school.graduates", "Percent.bachelor.s.degrees", "Percent.below.poverty.level" , "Percent.unemployment",  "Total.personal.income", "Geographic.region")

colnames(DI_df2) <- c("county", "Percent_population_aged_18_34","Percent_population_65_older","Number_active._physicians","Number_hospital_beds", "Total_serious_crimes" , "Percent_highschool_graduates","Percent_bachelor_degrees","Percent_below_poverty_level" , "Percent_unemployment",  "Total_personal_income", "Geographic_region")

DI_df2$NE <- I(DI_df2$Geographic_region=="1")*1
DI_df2$MW <- I(DI_df2$Geographic_region=="2")*1
DI_df2$STH <- I(DI_df2$Geographic_region=="3")*1
DI_df2$WST <- I(DI_df2$Geographic_region=="4")*1

# Remove the '_' character from the county names in the  'county' column :
DI_df2$county<-gsub("_"," ",as.character(DI_df2$county))


covid_df <- read.csv("covid_data.csv")

# select columns
covid_A<- covid_df%>%dplyr::select( "county", "state" ,  "fips" ,"cases"  ,  "deaths", "stay_at_home_announced", "stay_at_home_effective" ,  "total_population","area_sqmi" ,"population_density_per_sqmi","num_deaths", "years_of_potential_life_lost_rate", "percent_smokers" ,"percent_adults_with_obesity" ,"food_environment_index", "income_ratio" , "percent_physically_i0ctive" , "percent_uninsured" ,"num_primary_care_physicians" )


# Group by county and state - with MAX COVID CASES AND COVID DEATHS
covid_B <- covid_A%>%group_by(county, state, fips, total_population, population_density_per_sqmi,  years_of_potential_life_lost_rate,percent_smokers,percent_adults_with_obesity,
              food_environment_index,income_ratio,percent_physically_i0ctive,percent_uninsured,
              num_primary_care_physicians)%>%summarise(total_covid_cases = max(cases), total_covid_deaths = max(deaths))

colnames(covid_B)[11] <- "Percent_physically_inactive"

covid_B[is.na(covid_B)] = 0 # Replace NA with 0


Final Data Frame

  • Because of unmatched counties, there are some NA values from joining the two data frames.

  • Columns with NA values:

    • years_of_potential_life_lost_rate
    • food_environment_index
    • income_ratio
    • num_primary_care_physicians


  • Replaced NA values with 0 instead of removing the rows.


# Join the two data frames by County since what have in common
df2 <- left_join(covid_B, DI_df2, by = c("county"))   # Using Max Covid cases and Covid deaths

# Using Max deaths and Max Covid Cases
df2 <- df2%>%drop_na(Percent_population_aged_18_34,  Percent_population_65_older, Number_active._physicians, Number_hospital_beds ,Total_serious_crimes, Percent_highschool_graduates ,Percent_bachelor_degrees ,Percent_below_poverty_level, Percent_unemployment,Total_personal_income)

df2$total_covid_deaths <- replace(df2$total_covid_deaths, df2$total_covid_deaths == 0, 0.01)


kbl(head(df2)) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
county state fips total_population population_density_per_sqmi years_of_potential_life_lost_rate percent_smokers percent_adults_with_obesity food_environment_index income_ratio Percent_physically_inactive percent_uninsured num_primary_care_physicians total_covid_cases total_covid_deaths Percent_population_aged_18_34 Percent_population_65_older Number_active._physicians Number_hospital_beds Total_serious_crimes Percent_highschool_graduates Percent_bachelor_degrees Percent_below_poverty_level Percent_unemployment Total_personal_income Geographic_region NE MW STH WST
Ada Idaho 16001 425798 404.551458 5088.378 11.99070 25.6 8.1 4.478032 14.9 8.743292 428 18698 191 27.6 10.4 367 557 9701 87.2 24.9 6.2 4.1 3866 4 0 0 0 1
Adams Colorado 8001 479977 411.188748 6436.698 16.32461 27.8 8.7 3.725936 19.9 11.026551 219 17503 270 29.6 7.6 439 318 19369 78.8 13.0 8.8 5.0 4271 4 0 0 0 1
Adams Idaho 16003 3865 2.836063 0.000 14.35193 30.5 7.6 3.681277 21.2 14.383095 1 80 2 29.6 7.6 439 318 19369 78.8 13.0 8.8 5.0 4271 4 0 0 0 1
Adams Illinois 17001 66949 78.284133 7087.094 15.45719 36.6 8.3 4.225536 27.6 5.236587 59 2726 30 29.6 7.6 439 318 19369 78.8 13.0 8.8 5.0 4271 4 0 0 0 1
Adams Indiana 18001 34813 102.684660 7299.512 20.45444 34.8 8.1 3.819934 30.4 11.623897 13 823 11 29.6 7.6 439 318 19369 78.8 13.0 8.8 5.0 4271 4 0 0 0 1
Adams Iowa 19003 3822 9.026107 0.000 15.63671 29.9 8.7 3.532057 24.7 6.194375 1 93 1 29.6 7.6 439 318 19369 78.8 13.0 8.8 5.0 4271 4 0 0 0 1



Exploratory Data Analysis


Density Plots


  • The density plot shows the distribution of total covid deaths at the county level
  • In the states selected, total population for each county and total deaths from Covid are not different. This could suggest that counties with high total population will have high total Covid deaths.




Histogram of Total Covid Deaths:
  • Total covid deaths is not very normally distributed. It is slightly left skewed. This may affect the distribution of errors as well.

  • Total Covid Cases is more normally distributed.



Scatter Plots:


  • There does not seem to be an obvious linear relationship between total covid deaths and many of the predictors.

  • There is a funnel shape to the observations with the predictors total serious crimes, number of active physicians, and total population.




Split Data: Test and Train Split


set.seed(123)
n <- dim(df2)[1]

new_df <- df2[, -c(1, 2, 3, 26)]
indices <- sample(1:n, 0.7*n)
train_df <- new_df[indices, ]
test_df <- new_df[-indices,]



First Order Multiple Linear Regression




options("scipen"=10)
multi_reg_full <- lm(total_covid_deaths ~. , data = train_df )
summary(multi_reg_full)
## 
## Call:
## lm(formula = total_covid_deaths ~ ., data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -825.98  -42.41    0.72   37.84 1797.47 
## 
## Coefficients: (1 not defined because of singularities)
##                                        Estimate    Std. Error t value
## (Intercept)                       -358.34061952  145.03375259  -2.471
## total_population                     0.00021827    0.00006229   3.504
## population_density_per_sqmi          0.08409557    0.00708149  11.875
## years_of_potential_life_lost_rate   -0.00114883    0.00228592  -0.503
## percent_smokers                     -5.65515866    1.90955968  -2.961
## percent_adults_with_obesity         -0.45283990    1.05163727  -0.431
## food_environment_index              15.96423670    4.69689889   3.399
## income_ratio                        25.04675583    7.45675267   3.359
## Percent_physically_inactive          6.83084585    1.07734580   6.340
## percent_uninsured                   -0.40855468    1.05218049  -0.388
## num_primary_care_physicians          0.12052934    0.05070709   2.377
## total_covid_cases                    0.01053600    0.00114233   9.223
## Percent_population_aged_18_34       -1.75476783    1.43751189  -1.221
## Percent_population_65_older          1.37250226    1.51725274   0.905
## Number_active._physicians           -0.00663949    0.01206788  -0.550
## Number_hospital_beds                -0.00511980    0.00755801  -0.677
## Total_serious_crimes                -0.00083479    0.00023257  -3.589
## Percent_highschool_graduates        -1.17388537    1.39158036  -0.844
## Percent_bachelor_degrees             2.24522363    1.11691741   2.010
## Percent_below_poverty_level          4.34995689    1.78453263   2.438
## Percent_unemployment                 8.67971717    2.78717510   3.114
## Total_personal_income                0.00635333    0.00169736   3.743
## NE                                  58.34787575   17.67040485   3.302
## MW                                  32.00757564   15.31492160   2.090
## STH                                 12.54058136   16.11689652   0.778
## WST                                          NA            NA      NA
##                                         Pr(>|t|)    
## (Intercept)                             0.013610 *  
## total_population                        0.000474 ***
## population_density_per_sqmi              < 2e-16 ***
## years_of_potential_life_lost_rate       0.615352    
## percent_smokers                         0.003117 ** 
## percent_adults_with_obesity             0.666827    
## food_environment_index                  0.000697 ***
## income_ratio                            0.000805 ***
## Percent_physically_inactive       0.000000000314 ***
## percent_uninsured                       0.697862    
## num_primary_care_physicians             0.017598 *  
## total_covid_cases                        < 2e-16 ***
## Percent_population_aged_18_34           0.222420    
## Percent_population_65_older             0.365845    
## Number_active._physicians               0.582290    
## Number_hospital_beds                    0.498271    
## Total_serious_crimes                    0.000344 ***
## Percent_highschool_graduates            0.399068    
## Percent_bachelor_degrees                0.044615 *  
## Percent_below_poverty_level             0.014918 *  
## Percent_unemployment                    0.001885 ** 
## Total_personal_income                   0.000190 ***
## NE                                      0.000986 ***
## MW                                      0.036814 *  
## STH                                     0.436649    
## WST                                           NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 145.9 on 1312 degrees of freedom
## Multiple R-squared:  0.8437, Adjusted R-squared:  0.8408 
## F-statistic: 295.1 on 24 and 1312 DF,  p-value: < 2.2e-16



Study of Residuals and Other Diagnostics for First Order Full Model



Plots: First Order Full Model

  • Time plot of residuals:

    • There is no obvious pattern in the time plot.



  • Additional Plots to check regression assumptions:

    • Funnel shape for residuals. They are not constant.
    • Some outliers are present as seen in Residual-Leverage plot:
      • observations: 149, 1255, 594





ANOVA Table

  • Analysis of the Variance for each predictor shows that the following features have the highest F-score in this model:

    • Total population
    • population density per sqr mile
    • Total covid cases
    • percent physical inactivity


options("scipen"=10)
anova(multi_reg_full)
## Analysis of Variance Table
## 
## Response: total_covid_deaths
##                                     Df    Sum Sq   Mean Sq   F value    Pr(>F)
## total_population                     1 138344239 138344239 6501.0112 < 2.2e-16
## population_density_per_sqmi          1   6506461   6506461  305.7487 < 2.2e-16
## years_of_potential_life_lost_rate    1     30421     30421    1.4295  0.232061
## percent_smokers                      1      7846      7846    0.3687  0.543824
## percent_adults_with_obesity          1    140665    140665    6.6101  0.010250
## food_environment_index               1     48849     48849    2.2955  0.129990
## income_ratio                         1    554457    554457   26.0548 3.807e-07
## Percent_physically_inactive          1   1440500   1440500   67.6913 4.560e-16
## percent_uninsured                    1     36332     36332    1.7073  0.191567
## num_primary_care_physicians          1     14728     14728    0.6921  0.405608
## total_covid_cases                    1   1561963   1561963   73.3991 < 2.2e-16
## Percent_population_aged_18_34        1    143274    143274    6.7327  0.009572
## Percent_population_65_older          1    152986    152986    7.1891  0.007427
## Number_active._physicians            1    225513    225513   10.5972  0.001161
## Number_hospital_beds                 1     44721     44721    2.1015  0.147395
## Total_serious_crimes                 1    126065    126065    5.9240  0.015069
## Percent_highschool_graduates         1    147734    147734    6.9423  0.008517
## Percent_bachelor_degrees             1     10584     10584    0.4973  0.480794
## Percent_below_poverty_level          1       126       126    0.0059  0.938763
## Percent_unemployment                 1    573736    573736   26.9608 2.404e-07
## Total_personal_income                1    261773    261773   12.3011  0.000468
## NE                                   1    226091    226091   10.6244  0.001145
## MW                                   1    103058    103058    4.8428  0.027935
## STH                                  1     12884     12884    0.6054  0.436649
## Residuals                         1312  27919909     21280                    
##                                      
## total_population                  ***
## population_density_per_sqmi       ***
## years_of_potential_life_lost_rate    
## percent_smokers                      
## percent_adults_with_obesity       *  
## food_environment_index               
## income_ratio                      ***
## Percent_physically_inactive       ***
## percent_uninsured                    
## num_primary_care_physicians          
## total_covid_cases                 ***
## Percent_population_aged_18_34     ** 
## Percent_population_65_older       ** 
## Number_active._physicians         ** 
## Number_hospital_beds                 
## Total_serious_crimes              *  
## Percent_highschool_graduates      ** 
## Percent_bachelor_degrees             
## Percent_below_poverty_level          
## Percent_unemployment              ***
## Total_personal_income             ***
## NE                                ** 
## MW                                *  
## STH                                  
## Residuals                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


General Linear Test Approach


Reduced Model: Multiple First Order Regression model using only the following variables:

  • total_population
  • population_density_per_sqmi
  • food_environment_index
  • income_ratio
  • Percent_physically_inactive
  • Total_serious_crimes
  • Percent_below_poverty_level
  • Percent_unemployment
  • Total_personal_income
  • NE
  • MW
multi_reg_red <- lm(total_covid_deaths ~ total_population + population_density_per_sqmi +years_of_potential_life_lost_rate    +  years_of_potential_life_lost_rate  + income_ratio + Percent_physically_inactive + Total_serious_crimes +  Percent_below_poverty_level  +   Percent_unemployment + Total_personal_income + NE + MW , data = train_df)
summary(multi_reg_red)
## 
## Call:
## lm(formula = total_covid_deaths ~ total_population + population_density_per_sqmi + 
##     years_of_potential_life_lost_rate + years_of_potential_life_lost_rate + 
##     income_ratio + Percent_physically_inactive + Total_serious_crimes + 
##     Percent_below_poverty_level + Percent_unemployment + Total_personal_income + 
##     NE + MW, data = train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1265.77   -38.22     2.38    37.38  1617.80 
## 
## Coefficients:
##                                        Estimate    Std. Error t value
## (Intercept)                       -322.61251477   34.82394932  -9.264
## total_population                     0.00064757    0.00001262  51.330
## population_density_per_sqmi          0.09743751    0.00648922  15.015
## years_of_potential_life_lost_rate   -0.00652514    0.00201440  -3.239
## income_ratio                        18.48022163    7.10799426   2.600
## Percent_physically_inactive          6.08019740    0.93136831   6.528
## Total_serious_crimes                -0.00091299    0.00021190  -4.309
## Percent_below_poverty_level          3.41258505    1.34957217   2.529
## Percent_unemployment                 7.84559659    2.30705017   3.401
## Total_personal_income                0.00515327    0.00092341   5.581
## NE                                  44.75296914   12.35442618   3.622
## MW                                  11.33119659    9.89152854   1.146
##                                          Pr(>|t|)    
## (Intercept)                               < 2e-16 ***
## total_population                          < 2e-16 ***
## population_density_per_sqmi               < 2e-16 ***
## years_of_potential_life_lost_rate        0.001228 ** 
## income_ratio                             0.009428 ** 
## Percent_physically_inactive       0.0000000000945 ***
## Total_serious_crimes              0.0000176397379 ***
## Percent_below_poverty_level              0.011566 *  
## Percent_unemployment                     0.000692 ***
## Total_personal_income             0.0000000290173 ***
## NE                                       0.000303 ***
## MW                                       0.252190    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 151.9 on 1325 degrees of freedom
## Multiple R-squared:  0.8288, Adjusted R-squared:  0.8274 
## F-statistic:   583 on 11 and 1325 DF,  p-value: < 2.2e-16



Plots: First Order Reduced Model


  • The residuals are still funnel shaped and the red line is no longer horizontal.

  • Outliers are still the same as in full model.

  • There is no improvement in the Normal QQ plot.



Analysis of Model: Additional Tests on Errors


Shapiro-Wilk Test for normality

  • Alternatives:

    • \(H_0\): if p > 0.05, Error variances are normally distributed
    • \(H_a\): if p < 0.05, Error variances are not normally distributed


  • Conclusion:

    • Errors are not normally distributed.

    • This test confirms what we found in our visual analysis of the Normal QQ plot.


shapiro.test(multi_reg_red$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  multi_reg_red$residuals
## W = 0.6221, p-value < 2.2e-16


Breush-Pagan Test (with alpha = .05.): for constancy of errors

  • Since visual analysis shows that the errors are not constant but seem to increase with the value of X, we are doing a Breusch-Pagan test to determine if this is infact so.

Alternatives:

  • \(H_0\): Error variances are constant
  • \(H_a\): Error variances are not constant


Decision Rule:

  • If p_value > 0.05, accept \(H_0\), error variances are constant
  • If p_value < 0.05, reject \(H_0\), error variances are not constant


Conclusion:

  • p_value < 0.05
  • We reject the null hypothesis, \(H_0\). The error variances are not constant.


library(lmtest)
bptest(multi_reg_red, studentize = FALSE)
## 
##  Breusch-Pagan test
## 
## data:  multi_reg_red
## BP = 6583.7, df = 11, p-value < 2.2e-16



General Linear Test with ANOVA : To compare both models


  • Anova (reduced model, full model) with \(\alpha\) = 0.10

  • Alternatives:

    • \(H_0\): Coefficients of eliminated predictors = 0; Reduced model reduces variablity significantly
    • \(H_a\): Coefficients of elimintaed predictors \(\neq\) 0 . Full model reduces variablity more than reduced model.
  • Conclusion:

    • F* = 9.6407 , F_critical = 2.397052

    • F* > F_critical, we reject the null hypothesis.


anova(multi_reg_red, multi_reg_full)
## Analysis of Variance Table
## 
## Model 1: total_covid_deaths ~ total_population + population_density_per_sqmi + 
##     years_of_potential_life_lost_rate + years_of_potential_life_lost_rate + 
##     income_ratio + Percent_physically_inactive + Total_serious_crimes + 
##     Percent_below_poverty_level + Percent_unemployment + Total_personal_income + 
##     NE + MW
## Model 2: total_covid_deaths ~ total_population + population_density_per_sqmi + 
##     years_of_potential_life_lost_rate + percent_smokers + percent_adults_with_obesity + 
##     food_environment_index + income_ratio + Percent_physically_inactive + 
##     percent_uninsured + num_primary_care_physicians + total_covid_cases + 
##     Percent_population_aged_18_34 + Percent_population_65_older + 
##     Number_active._physicians + Number_hospital_beds + Total_serious_crimes + 
##     Percent_highschool_graduates + Percent_bachelor_degrees + 
##     Percent_below_poverty_level + Percent_unemployment + Total_personal_income + 
##     NE + MW + STH + WST
##   Res.Df      RSS Df Sum of Sq      F    Pr(>F)    
## 1   1325 30586971                                  
## 2   1312 27919909 13   2667062 9.6407 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# critical value

t_critical =qt(1-.10/12, 1313 ) 
t_critical
## [1] 2.397052



Transformation on Response Variable: Box-Cox Transformation


  • To find an appropriate transformation for lambda.

  • Full model used.

  • The Box-Cox transformation suggests \(\lambda\) = 0.2363636

  • Using \(\lambda\) value to do a log transformation of Y to transform the predictor variable:

    • Y^0.2363636


## [1] 0.2363636


  • Linear Regression on transfromed variable total_covid_deaths

    • Transformation has reduced the \(R^2\) value and the F-statistic.


## 
## Call:
## lm(formula = trans ~ ., data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.6261 -0.4000  0.0092  0.4721  2.5402 
## 
## Coefficients: (1 not defined because of singularities)
##                                        Estimate    Std. Error t value  Pr(>|t|)
## (Intercept)                        2.7343771954  0.6957476188   3.930 0.0000893
## total_population                   0.0000001295  0.0000002995   0.432   0.66555
## population_density_per_sqmi        0.0003958170  0.0000356672  11.098   < 2e-16
## years_of_potential_life_lost_rate  0.0000997540  0.0000109415   9.117   < 2e-16
## percent_smokers                   -0.0396555095  0.0091696894  -4.325 0.0000164
## percent_adults_with_obesity        0.0096747126  0.0050335141   1.922   0.05481
## food_environment_index             0.0255862332  0.0225782097   1.133   0.25733
## income_ratio                       0.0116900798  0.0358412980   0.326   0.74435
## Percent_physically_inactive       -0.0171320116  0.0052345999  -3.273   0.00109
## percent_uninsured                  0.0002252495  0.0050360476   0.045   0.96433
## num_primary_care_physicians        0.0004944641  0.0002432072   2.033   0.04224
## total_covid_cases                  0.0000087679  0.0000056417   1.554   0.12039
## total_covid_deaths                 0.0011261633  0.0001321319   8.523   < 2e-16
## Percent_population_aged_18_34      0.0025003765  0.0068838685   0.363   0.71650
## Percent_population_65_older        0.0088636871  0.0072638682   1.220   0.22259
## Number_active._physicians         -0.0000604859  0.0000577638  -1.047   0.29523
## Number_hospital_beds               0.0000149010  0.0000361791   0.412   0.68050
## Total_serious_crimes              -0.0000001522  0.0000011185  -0.136   0.89180
## Percent_highschool_graduates      -0.0189318332  0.0066619392  -2.842   0.00456
## Percent_bachelor_degrees           0.0235960736  0.0053538163   4.407 0.0000113
## Percent_below_poverty_level        0.0033040602  0.0085601294   0.386   0.69957
## Percent_unemployment               0.0006071792  0.0133886900   0.045   0.96384
## Total_personal_income             -0.0000179859  0.0000081669  -2.202   0.02782
## NE                                -0.1355289936  0.0849216181  -1.596   0.11075
## MW                                -0.0986218816  0.0734194522  -1.343   0.17942
## STH                               -0.2077434765  0.0771536066  -2.693   0.00718
## WST                                          NA            NA      NA        NA
##                                      
## (Intercept)                       ***
## total_population                     
## population_density_per_sqmi       ***
## years_of_potential_life_lost_rate ***
## percent_smokers                   ***
## percent_adults_with_obesity       .  
## food_environment_index               
## income_ratio                         
## Percent_physically_inactive       ** 
## percent_uninsured                    
## num_primary_care_physicians       *  
## total_covid_cases                    
## total_covid_deaths                ***
## Percent_population_aged_18_34        
## Percent_population_65_older          
## Number_active._physicians            
## Number_hospital_beds                 
## Total_serious_crimes                 
## Percent_highschool_graduates      ** 
## Percent_bachelor_degrees          ***
## Percent_below_poverty_level          
## Percent_unemployment                 
## Total_personal_income             *  
## NE                                   
## MW                                   
## STH                               ** 
## WST                                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6982 on 1311 degrees of freedom
## Multiple R-squared:  0.6396, Adjusted R-squared:  0.6327 
## F-statistic: 93.07 on 25 and 1311 DF,  p-value: < 2.2e-16


Plots of Regression Model with Transformed Variable


  • Residuals:

    • Non-constancy of errors has increased.
  • Outliers:

    • Observations: 594, 149, 594

    • These observations could be distorting the plot results

  • Normality:

    • The Normal QQ plot has improved from the transformation




Second Order Regression Model



train_df2 <- train_df[-c(149, 1255, 594), ]
train_df2 <- train_df2[, -c(27) ]

# Centering needed.
x_crimes = (train_df2$Total_serious_crimes - mean(train_df2$Total_serious_crimes))
train_df2$x_crimes_sqr = x_crimes^2

x_income = (train_df2$Total_personal_income - mean(train_df2$Total_personal_income))
train_df2$x_income_sqr = x_income^2


multi_second_order <- lm(total_covid_deaths ~  x_crimes_sqr + x_income_sqr+ total_population + num_primary_care_physicians + Percent_population_aged_18_34 + Percent_bachelor_degrees+ Number_active._physicians + NE + percent_uninsured + total_covid_cases + years_of_potential_life_lost_rate  + Percent_physically_inactive + MW+ Percent_highschool_graduates + percent_adults_with_obesity+ population_density_per_sqmi+  food_environment_index+ Percent_unemployment +  Percent_below_poverty_level + Percent_population_65_older + WST+ STH + Number_hospital_beds, data = train_df2)


summary(multi_second_order)
## 
## Call:
## lm(formula = total_covid_deaths ~ x_crimes_sqr + x_income_sqr + 
##     total_population + num_primary_care_physicians + Percent_population_aged_18_34 + 
##     Percent_bachelor_degrees + Number_active._physicians + NE + 
##     percent_uninsured + total_covid_cases + years_of_potential_life_lost_rate + 
##     Percent_physically_inactive + MW + Percent_highschool_graduates + 
##     percent_adults_with_obesity + population_density_per_sqmi + 
##     food_environment_index + Percent_unemployment + Percent_below_poverty_level + 
##     Percent_population_65_older + WST + STH + Number_hospital_beds, 
##     data = train_df2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -832.15  -35.89    3.86   40.18 1748.64 
## 
## Coefficients: (1 not defined because of singularities)
##                                            Estimate        Std. Error t value
## (Intercept)                       -291.602242308528  129.668556472700  -2.249
## x_crimes_sqr                        -0.000000003206    0.000000001455  -2.203
## x_income_sqr                         0.000000066764    0.000000021174   3.153
## total_population                     0.000178828412    0.000061537299   2.906
## num_primary_care_physicians          0.071498724141    0.049728412575   1.438
## Percent_population_aged_18_34       -1.426722585004    1.407226509227  -1.014
## Percent_bachelor_degrees             2.911555092503    1.105707272913   2.633
## Number_active._physicians            0.011092205203    0.010165861034   1.091
## NE                                  55.073344863845   12.811388936563   4.299
## percent_uninsured                    0.260078769152    1.013032086181   0.257
## total_covid_cases                    0.010861988102    0.001113259001   9.757
## years_of_potential_life_lost_rate   -0.002109022679    0.001836507259  -1.148
## Percent_physically_inactive          6.117400620614    1.010883770772   6.052
## MW                                  22.672094966584   11.444153513463   1.981
## Percent_highschool_graduates        -1.384717488152    1.349042387426  -1.026
## percent_adults_with_obesity         -1.005006518008    1.016696142233  -0.989
## population_density_per_sqmi          0.162676806674    0.009849863204  16.516
## food_environment_index              13.371721046341    4.271889319542   3.130
## Percent_unemployment                11.235904687975    2.664917265185   4.216
## Percent_below_poverty_level          2.951574347259    1.607666276548   1.836
## Percent_population_65_older          1.414834234042    1.459404623786   0.969
## WST                                  7.766065776180   15.129932545096   0.513
## STH                                              NA                NA      NA
## Number_hospital_beds                -0.005249438686    0.007019558439  -0.748
##                                        Pr(>|t|)    
## (Intercept)                             0.02469 *  
## x_crimes_sqr                            0.02779 *  
## x_income_sqr                            0.00165 ** 
## total_population                        0.00372 ** 
## num_primary_care_physicians             0.15073    
## Percent_population_aged_18_34           0.31084    
## Percent_bachelor_degrees                0.00856 ** 
## Number_active._physicians               0.27542    
## NE                                0.00001844480 ***
## percent_uninsured                       0.79743    
## total_covid_cases                       < 2e-16 ***
## years_of_potential_life_lost_rate       0.25102    
## Percent_physically_inactive       0.00000000187 ***
## MW                                      0.04779 *  
## Percent_highschool_graduates            0.30487    
## percent_adults_with_obesity             0.32309    
## population_density_per_sqmi             < 2e-16 ***
## food_environment_index                  0.00179 ** 
## Percent_unemployment              0.00002654554 ***
## Percent_below_poverty_level             0.06659 .  
## Percent_population_65_older             0.33249    
## WST                                     0.60783    
## STH                                          NA    
## Number_hospital_beds                    0.45470    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 142 on 1311 degrees of freedom
## Multiple R-squared:  0.8501, Adjusted R-squared:  0.8476 
## F-statistic: 337.9 on 22 and 1311 DF,  p-value: < 2.2e-16


Plots: Residuals vs Y, Normal QQ, Error Variances, Scale-Location, Residual vs Leverage


Predict Total Covid Deaths



Summary of Prediction and Model Results:



pred_full <- predict(multi_reg_full, newdata =  test_df)
pred_reduced <- predict(multi_reg_red, newdata =  test_df)
pred_transf <- predict(transformed_lm, newdata = test_df)

test_df2 <- test_df[, -c(17, 22)]
x_crimes = (test_df$Total_serious_crimes - mean(test_df$Total_serious_crimes))
test_df2$x_crimes_sqr = x_crimes^2

x_income = (test_df$Total_personal_income - mean(test_df$Total_personal_income))
test_df2$x_income_sqr = x_income^2


pred_second<- predict(multi_second_order, newdata =  test_df2)

full_model <- data.frame(obs = test_df$total_covid_deaths, pred = pred_full)

reduced_model <- data.frame(obs = test_df$total_covid_deaths, pred = pred_reduced)

transf_model <- data.frame(obs = test_df$total_covid_deaths, pred = pred_transf)

second_model <- data.frame(obs = test_df$total_covid_deaths, pred = pred_second)

# Get Total Sum of Squares to get MAE and MSE and R squared values for prediction results
SST <- sum((test_df$total_covid_deaths - mean(test_df$total_covid_deaths))^2)

# first order full
MAE_first <- mean(abs(pred_full - test_df$total_covid_deaths))
MSE_first <- mean((pred_full - test_df$total_covid_deaths)^2)
SSR_first <- sum((pred_full - mean(test_df$total_covid_deaths))^2)
R_squared_full <- SSR_first/SST

#  first order Reduced
MAE_red <- mean(abs(pred_reduced - test_df$total_covid_deaths))
MSE_red <- mean((pred_reduced - test_df$total_covid_deaths)^2)
SSR_red <- sum((pred_reduced - mean(test_df$total_covid_deaths))^2)
R_squared_red <- SSR_red/SST

# Transformed Y
MAE_tranf <- mean(abs(pred_transf - test_df$total_covid_deaths))
MSE_transf <- mean((pred_transf - test_df$total_covid_deaths)^2)
SSR_transf <- sum((pred_transf - mean(test_df$total_covid_deaths))^2)
R_squared_transf <- SSR_transf/SST

# Second order full
MAE_second <- mean(abs(pred_second - test_df$total_covid_deaths))
MSE_second <- mean((pred_second - test_df$total_covid_deaths)^2)
SSR_second <- sum((pred_second - mean(test_df$total_covid_deaths))^2)
R_squared_second <- SSR_second/SST

red_rsquared <- summary(multi_reg_red)$r.squared
full_rsquared <- summary(multi_reg_full)$r.squared
transf_rsquared <- summary(transformed_lm)$r.squared
multi_rsquared <- summary(multi_second_order)$r.squared

results_df <- data.frame( Model = c("Full-first order", "Reduced-first order", "Transformed Y", "Second Order"), Prediction_Rsqrd = c (R_squared_full, R_squared_red, R_squared_transf, R_squared_second), Model_Rsqrd = c(full_rsquared, red_rsquared,transf_rsquared, multi_rsquared ) )


Results

kbl(results_df) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Model Prediction_Rsqrd Model_Rsqrd
Full-first order 0.7532813 0.8437041
Reduced-first order 0.7634522 0.8287738
Transformed Y 0.1313356 0.6396143
Second Order 0.9489004 0.8500823


Conclusion

  • The second order multiple linear regression model has both the highest \(R^2\) from prediction results and from the model results.

  • The reduced model has a slightly higher predictability than the full model even though its model’s \(R^2\) is lower.

  • Transformation of the observation variable did very poorly in both prediction and model results.