Project Purpose, Contents, and Data Sources:


Purpose


Contents:


The Data:


Data Collection and Preparation:


DI_df <- read.csv("Disease_Indicators_data.csv")

DI_df2<- DI_df%>%dplyr::select("County", "Percent.of.population.aged.18.34", "Percent.of.population.65.or.older", "Number.of.active.physicians","Number.of.hospital.beds", "Total.serious.crimes" , "Percent.high.school.graduates", "Percent.bachelor.s.degrees", "Percent.below.poverty.level" , "Percent.unemployment",  "Total.personal.income", "Geographic.region")

colnames(DI_df2) <- c("county", "Percent_population_aged_18_34","Percent_population_65_older","Number_active._physicians","Number_hospital_beds", "Total_serious_crimes" , "Percent_highschool_graduates","Percent_bachelor_degrees","Percent_below_poverty_level" , "Percent_unemployment",  "Total_personal_income", "Geographic_region")

DI_df2$NE <- I(DI_df2$Geographic_region=="1")*1
DI_df2$MW <- I(DI_df2$Geographic_region=="2")*1
DI_df2$STH <- I(DI_df2$Geographic_region=="3")*1
DI_df2$WST <- I(DI_df2$Geographic_region=="4")*1


# Remove the '_' character from the county names in the  'county' column :
DI_df2$county<-gsub("_"," ",as.character(DI_df2$county))


covid_df <- read.csv("covid_data.csv")

# select columns
covid_df2<- covid_df%>%dplyr::select("date", "county" , "state" ,  "fips" ,  "lat", "lon","cases"  ,  "deaths", "stay_at_home_announced",        "stay_at_home_effective" ,  "total_population","area_sqmi" ,"population_density_per_sqmi","num_deaths",                     "years_of_potential_life_lost_rate", "percent_smokers" ,"percent_adults_with_obesity" ,"food_environment_index", "income_ratio" ,   "percent_physically_i0ctive" , "percent_uninsured" ,"num_primary_care_physicians" )
covid_df2$date <- as.Date(covid_df2$date,format = "%m/%d/%Y")


# Group by county and state - with MAX COVID CASES AND COVID DEATHS
covid_df3 <- covid_df2%>%group_by( county, state, fips, lat, lon, total_population, population_density_per_sqmi,  years_of_potential_life_lost_rate,percent_smokers,percent_adults_with_obesity,
              food_environment_index,income_ratio,percent_physically_i0ctive,percent_uninsured,
              num_primary_care_physicians)%>%summarise(total_covid_cases = max(cases), total_covid_deaths = max(deaths))

colnames(covid_df3)[13] <- "Percent_physically_inactive"

covid_df3[is.na(covid_df3)] = 0 # Replace NA with 0


Final Data Frame

  • Because of unmatched counties, there are some NA values from joining the two data frames.

  • Columns with NA values:

    • years_of_potential_life_lost_rate
    • food_environment_index
    • income_ratio
    • num_primary_care_physicians


  • Replaced NA values with 0 instead of removing the rows.


# Join the two data frames by County since what have in common
df <- left_join(covid_df3, DI_df2, by = c("county"))   # Using Max Covid cases and Covid deaths
#summary(covid_df2)

# Using Max deaths and Max Covid Cases
df <- df%>%drop_na(Percent_population_aged_18_34,  Percent_population_65_older, Number_active._physicians, Number_hospital_beds ,Total_serious_crimes, Percent_highschool_graduates ,Percent_bachelor_degrees ,Percent_below_poverty_level, Percent_unemployment,Total_personal_income)

df$total_covid_deaths <- replace(df$total_covid_deaths, df$total_covid_deaths == 0, 0.01)

kbl(head(df)) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))# length(unique(df$state))
county state fips lat lon total_population population_density_per_sqmi years_of_potential_life_lost_rate percent_smokers percent_adults_with_obesity food_environment_index income_ratio Percent_physically_inactive percent_uninsured num_primary_care_physicians total_covid_cases total_covid_deaths Percent_population_aged_18_34 Percent_population_65_older Number_active._physicians Number_hospital_beds Total_serious_crimes Percent_highschool_graduates Percent_bachelor_degrees Percent_below_poverty_level Percent_unemployment Total_personal_income Geographic_region NE MW STH WST
Ada Idaho 16001 43.45110 -116.24117 425798 404.551458 5088.378 11.99070 25.6 8.1 4.478032 14.9 8.743292 428 18698 191 27.6 10.4 367 557 9701 87.2 24.9 6.2 4.1 3866 4 0 0 0 1
Adams Colorado 8001 39.87363 -104.33778 479977 411.188748 6436.698 16.32461 27.8 8.7 3.725936 19.9 11.026551 219 17503 270 29.6 7.6 439 318 19369 78.8 13.0 8.8 5.0 4271 4 0 0 0 1
Adams Idaho 16003 44.88960 -116.45384 3865 2.836063 0.000 14.35193 30.5 7.6 3.681277 21.2 14.383095 1 80 2 29.6 7.6 439 318 19369 78.8 13.0 8.8 5.0 4271 4 0 0 0 1
Adams Illinois 17001 39.98788 -91.18854 66949 78.284133 7087.094 15.45719 36.6 8.3 4.225536 27.6 5.236587 59 2726 30 29.6 7.6 439 318 19369 78.8 13.0 8.8 5.0 4271 4 0 0 0 1
Adams Indiana 18001 40.74564 -84.93662 34813 102.684660 7299.512 20.45444 34.8 8.1 3.819934 30.4 11.623897 13 823 11 29.6 7.6 439 318 19369 78.8 13.0 8.8 5.0 4271 4 0 0 0 1
Adams Iowa 19003 41.02898 -94.69918 3822 9.026107 0.000 15.63671 29.9 8.7 3.532057 24.7 6.194375 1 93 1 29.6 7.6 439 318 19369 78.8 13.0 8.8 5.0 4271 4 0 0 0 1


  • Look data in detail from final data frame, df2:
str(df, max.level = 2)
## gropd_df [1,910 x 32] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ county                           : chr [1:1910] "Ada" "Adams" "Adams" "Adams" ...
##  $ state                            : chr [1:1910] "Idaho" "Colorado" "Idaho" "Illinois" ...
##  $ fips                             : chr [1:1910] "16001" "8001" "16003" "17001" ...
##  $ lat                              : num [1:1910] 43.5 39.9 44.9 40 40.7 ...
##  $ lon                              : num [1:1910] -116.2 -104.3 -116.5 -91.2 -84.9 ...
##  $ total_population                 : int [1:1910] 425798 479977 3865 66949 34813 3822 31747 31536 2348 28111 ...
##  $ population_density_per_sqmi      : num [1:1910] 404.55 411.19 2.84 78.28 102.68 ...
##  $ years_of_potential_life_lost_rate: num [1:1910] 5088 6437 0 7087 7300 ...
##  $ percent_smokers                  : num [1:1910] 12 16.3 14.4 15.5 20.5 ...
##  $ percent_adults_with_obesity      : num [1:1910] 25.6 27.8 30.5 36.6 34.8 29.9 35.3 36.7 30.2 32.2 ...
##  $ food_environment_index           : num [1:1910] 8.1 8.7 7.6 8.3 8.1 8.7 4.9 7.7 8.9 7.1 ...
##  $ income_ratio                     : num [1:1910] 4.48 3.73 3.68 4.23 3.82 ...
##  $ Percent_physically_inactive      : num [1:1910] 14.9 19.9 21.2 27.6 30.4 24.7 31.9 25.5 27.3 36.6 ...
##  $ percent_uninsured                : num [1:1910] 8.74 11.03 14.38 5.24 11.62 ...
##  $ num_primary_care_physicians      : int [1:1910] 428 219 1 59 13 1 29 28 10 12 ...
##  $ total_covid_cases                : int [1:1910] 18698 17503 80 2726 823 93 1155 1031 109 324 ...
##  $ total_covid_deaths               : num [1:1910] 191 270 2 30 11 1 46 16 0.01 6 ...
##  $ Percent_population_aged_18_34    : num [1:1910] 27.6 29.6 29.6 29.6 29.6 29.6 29.6 29.6 29.6 29.6 ...
##  $ Percent_population_65_older      : num [1:1910] 10.4 7.6 7.6 7.6 7.6 7.6 7.6 7.6 7.6 7.6 ...
##  $ Number_active._physicians        : int [1:1910] 367 439 439 439 439 439 439 439 439 439 ...
##  $ Number_hospital_beds             : int [1:1910] 557 318 318 318 318 318 318 318 318 318 ...
##  $ Total_serious_crimes             : int [1:1910] 9701 19369 19369 19369 19369 19369 19369 19369 19369 19369 ...
##  $ Percent_highschool_graduates     : num [1:1910] 87.2 78.8 78.8 78.8 78.8 78.8 78.8 78.8 78.8 78.8 ...
##  $ Percent_bachelor_degrees         : num [1:1910] 24.9 13 13 13 13 13 13 13 13 13 ...
##  $ Percent_below_poverty_level      : num [1:1910] 6.2 8.8 8.8 8.8 8.8 8.8 8.8 8.8 8.8 8.8 ...
##  $ Percent_unemployment             : num [1:1910] 4.1 5 5 5 5 5 5 5 5 5 ...
##  $ Total_personal_income            : int [1:1910] 3866 4271 4271 4271 4271 4271 4271 4271 4271 4271 ...
##  $ Geographic_region                : int [1:1910] 4 4 4 4 4 4 4 4 4 4 ...
##  $ NE                               : 'AsIs' num [1:1910] 0 0 0 0 0 0 0 0 0 0 ...
##  $ MW                               : 'AsIs' num [1:1910] 0 0 0 0 0 0 0 0 0 0 ...
##  $ STH                              : 'AsIs' num [1:1910] 0 0 0 0 0 0 0 0 0 0 ...
##  $ WST                              : 'AsIs' num [1:1910] 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "groups")= tibble [1,032 x 15] (S3: tbl_df/tbl/data.frame)
##   ..- attr(*, ".drop")= logi TRUE


  • Map data frame:
# get state data from 'map' library
states <- map_data("state")
###########################################################################

# Add 'group' column to dataframe
    # Step1:  get county data from maps
county_map <- map_data("county")
county_map$subregion<- str_to_title(county_map$subregion)
county_map$region<- str_to_title(county_map$region)
#############################################################################

# Using covid_df2 for mapping
covid_df3_map<- covid_df2%>%dplyr::select("county"  ,"state", "fips" , "total_population", "cases", "deaths", "percent_smokers", "income_ratio" )
colnames(covid_df3_map)[2] <- "region"
colnames(covid_df3_map)[1] <- "subregion"

covid_df3_map <- covid_df3_map%>%group_by( subregion, region, fips, total_population,  percent_smokers, income_ratio)%>%summarise(total_covid_cases = max(cases), total_covid_deaths = max(deaths))

# Joion the two data frames
covid_df3_map2 <- left_join(covid_df3_map, county_map, by = c('region', 'subregion'))
covid_df3_map2 <- covid_df3_map2%>%drop_na(long ,lat, group, order)


Exploratory Data Analysis on Data



Map: Income Ratio and Percent Smokers at County level in the United States

  • Income ratio is defined as: monthly debt/monthly income.

    • A higher income ratio indicates higher personal debt than monthly personal income earned.


  • In the Southern region of the U.S.A., there are more counties that have both a high income ratio and a high percentage of smokers.


ggplot() +
  geom_polygon(data = covid_df3_map2, aes(fill = income_ratio, x = long, y = lat, group = group)) +
   geom_polygon(data = states, aes(x = long, y = lat, group = group), color = "white", fill = "transparent",  size = 0.1, alpha = 0.3)+
  theme_minimal() +
  scale_fill_viridis(option = "plasma", trans = "log", breaks=c( 2.5, 3, 3.5, 4, 4.5, 5, 5.5,  6,6.5,  7, 7.5, 8, 8.5), name="Income Ratio", guide = guide_legend( keyheight = unit(3, units = "mm"), keywidth=unit(12, units = "mm"), label.position = "bottom", title.position = 'top', nrow=1) ) +
   labs(
    title = "Income Ratio per County",
  #  subtitle = "Number of restaurant per city district",
  ) + theme(legend.position = "bottom")+
  theme(axis.line = element_blank(), axis.text = element_blank(),
        axis.ticks = element_blank(), axis.title = element_blank()) +
  coord_map()

  # 

ggplot() +
  geom_polygon(data = covid_df3_map2, aes(fill = percent_smokers, x = long, y = lat, group = group)) +
   geom_polygon(data = states, aes(x = long, y = lat, group = group), color = "white", fill = "transparent",  size = 0.1, alpha = 0.3)+
  theme_minimal() +
    scale_fill_viridis(option = 'magma',trans = "log", breaks=c( 5, 10, 15, 20, 25, 30, 35, 40, 45), name="Percent Smokers per County", guide = guide_legend( keyheight = unit(3, units = "mm"), keywidth=unit(12, units = "mm"), label.position = "bottom", title.position = 'top', nrow=1) ) +
  labs(
    title = "Percent Smokers per County",
  #  subtitle = "Number of restaurant per city district",
  ) + theme(legend.position = "bottom")+
  theme(axis.line = element_blank(), axis.text = element_blank(),
        axis.ticks = element_blank(), axis.title = element_blank()) +
  coord_map()



Histogram, Density, and Scatter Plots:




Histogram of Percent Smokers:
  • Slightly left skewed but mostly normal shaped.



Density Plots


  • Data on percent smokers, income ratio, and percent uninsured is at the county level.

  • Below are the distributions of the counties for each state of these variables.

  • Percent Smokers:

    • California, for example, has a county with 15 percent of the population which are smokers. That is one of the highest in the state.
    • Florida has a wide distribution of percent smokers across the state, but all counties have a high percentage compared to California
  • Uninsured:

    • Texas and Florida counties have a high percentage of uninsured residents.




Scatter Plots


  • Each point the the scatter plot represents a county.
    • Income Ratio: monthly debt / monthly income.

    • From the scatter plots:

      • There seems to be a linear relationship between income ratio and percent smokers. As the county’s income_ratio increases, it seems the percent of smokers also increases. Counties with larger monthly debt per household have a higher percent of smokers.

      • There also seems to be a slight parabolic as well as funnel shape relationship between percent uninsured and percent smokers when we add the variable Geographic_region (North East, Southern Region, MidWest, West).




Model: Simple Linear Regression


# Model
reg_linear <- lm(percent_smokers ~ income_ratio, data = df )
# summary(reg_linear)

# coefficients of slope and intercept
coeffs<- summary(reg_linear)$coefficients
intercept<-coeffs[1]
slope<- coeffs[2]
coeffs
##              Estimate Std. Error  t value     Pr(>|t|)
## (Intercept)  9.742844  0.5039726 19.33209 3.475382e-76
## income_ratio 1.673996  0.1093288 15.31158 5.299130e-50


Plot of regression line:

  • The plot the predictor variable, income_ratio, and percent smokers shows that linear regression function is appropriate.
  • The line is not steep but it also does not have a slope of zero. Many of the observations are spread out and far from the mean (\(y=0\)). This will most likely result in a small \(R^2\) since the Total Sum of Squares (SSTO) will be very large.



The T-Test


To see if there is linear association between the number of income_ratio and percent_smokers.


The Alternatives:

  • \(H_0\): \(b_1\) = 0. There is no relationship.
  • \(H_1\): \(b_1\) \(\neq\) 0. There is a relationship.


The Decision Rule: (for \(a\) = 0.05, \(t_a\) = 1.965395)

  • Accept \(H_0\) if t* < \(t_a\).
  • Reject \(H_0\) if t* > \(t_a\).


Conclusion:

  • For \(b_1\): We reject the null hypothesis. There is a linear relationship between income_ratio and percent_smokers

    • The t statistic for \(b_1\) = 15.31158

      • (t* = 15.31158) > (\(t_a\) = 1.96)


  • For \(b_0\): We reject the null hypothesis. The intercept is significant.

    • The t statistic for \(b_0\) = 19.33209

      • (t* = 19.33209) < (\(t_a\) = 1.96)


\(t_a\) Intercept t* S lope t*
1.965395 19.33209 15.31158


# standard of error of slope and intercept
#coef(summary(reg_linear))[, "Std. Error"]
int_ste <-  coef(summary(reg_linear))[, "Std. Error"][1]
slope_ste <- coef(summary(reg_linear))[, "Std. Error"][2]

# N-2 degrees of freedom
observations = 1910
degree_freedom = observations - 2

# T- statistic& p-value calculation
t_statistic_slope = slope/slope_ste
t_statistic_intercept = intercept/int_ste
t_statistic_slope
## income_ratio 
##     15.31158
t_statistic_intercept
## (Intercept) 
##    19.33209



The P value:


For the p-value, we need:

  • t* and degrees of freedom

  • p_value_intercept = 2 x (1 - pt(t*, degrees of freedom))


Conclusion:

  • The p-value for the slope, \(b_1\) is essentially 0.

  • This result also supports our earlier conclusion that there is a linear relationship between income_ratio and percent_smokers and it is statistically significant.

p_value_b1 = 2 * (1 - pt(t_statistic_slope, degree_freedom))

p_value_b1
## income_ratio 
##            0



90% Confidence Interval for \(b_1\)

  • Confidence interval: [1.494079 , 1.853913]

  • If we were to repeat this test 90 times, the value of \(b_1\) would fall somewhere in this interval.

  • The confidence interval for \(b_1\) does not include zero, therefore, from the confidence interval we can also conclude the slope is significant.

confint(reg_linear, "income_ratio", level = 0.90)
##                   5 %     95 %
## income_ratio 1.494079 1.853913


From Summary

  • The summary output confirms our previous results from doing the t-test, p-value, and confidence interval:

    • The predictor income ratio is significant.

    • The \(R^2\) = 0.1094 which is low.

    • A low \(R^2\) does not necessarily mean there is not a linear relationship between income_ratio and percent_smokers. The scatter plot shows a wide but linearly increasing set of observations. Also, the resulting regression line is not very steep (from previous plot), increasing slowly, therefore the Regression Sum of Squares (SSR) will naturally be low contributing to the small \(R^2\) value.

    • Variance: 10.9714

## 
## Call:
## lm(formula = percent_smokers ~ income_ratio, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.2495  -2.1196  -0.0929   2.1023   9.1150 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.7428     0.5040   19.33   <2e-16 ***
## income_ratio   1.6740     0.1093   15.31   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.312 on 1908 degrees of freedom
## Multiple R-squared:  0.1094, Adjusted R-squared:  0.109 
## F-statistic: 234.4 on 1 and 1908 DF,  p-value: < 2.2e-16
## [1] 10.9714



Study of Residuals and Other Diagnostics



Plots: Time Plot, Residuals vs Y, Normal QQ, Error Variances, Scale-Location, Residual vs Leverage


  • Time plot of residuals:

    • There is no obvious pattern in the time plot, therefore the resduals are independent.



  • Additional Plots to check regression assumptions:

    • Residual plot: errors are evenly distributed around zero. We can conclude that the errors are constant in this model.
    • Normal QQ plot: the errors are falling nicely on the diagonal line with very little departure at the tails. The errors are normally distributed
    • Residual vs Leverage: There are no points falling outside of Cook’s distance, therefore no influential points.
    • Scale Location: The residuals look well dispered around the red line and the red line is close to horizontal. We can conclude that homoscedasticity has been satisfied by the model.



ANOVA Table

  • Analysis of the Variance for the predictor:

    • F-score: 234.44
    • P score shows F-score is significant.
    • The predictor is significant in the linear model


options("scipen"=10)
anova(reg_linear)
## Analysis of Variance Table
## 
## Response: percent_smokers
##                Df  Sum Sq Mean Sq F value    Pr(>F)    
## income_ratio    1  2572.2 2572.19  234.44 < 2.2e-16 ***
## Residuals    1908 20933.4   10.97                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


Histogram of Residuals
  • The histogram of the residuals shows a slightly off normal curve, not exactly balanced around zero.

  • The difference from normality is not significant to conclude non-normality of errors.



Correlation Test:
  • p-value < 0.05
  • correlation: 0.3307999
  • Reject \(H_0\), correlation is not zero. There is a positive correlation between the two variables.


cor.test(df$income_ratio, df$percent_smokers)
## 
##  Pearson's product-moment correlation
## 
## data:  df$income_ratio and df$percent_smokers
## t = 15.312, df = 1908, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2902544 0.3701597
## sample estimates:
##       cor 
## 0.3307999



Predicting Percent Smokers: Simple Linear Regression model




predict(reg_linear, data.frame(income_ratio = 4.478032), interval = "prediction", level = 0.90, se.fit = FALSE)
##        fit      lwr      upr
## 1 17.23905 11.78669 22.69141
mae <- mean(abs(df$percent_smokers[1]-17.23905 ))
mae
## [1] 5.248349



Model: First Order Multiple Linear Regression with Three Variables





options("scipen"=10)
multi_reg <- lm(percent_smokers ~ income_ratio + percent_uninsured + percent_adults_with_obesity , data = df )
(summary(multi_reg)$sigma)^2
## [1] 7.301244
# coefficients of slope and intercept
coeffs<- summary(multi_reg)$coefficients
intercept<-coeffs[1]
slope<- coeffs[2]
summary(multi_reg)
## 
## Call:
## lm(formula = percent_smokers ~ income_ratio + percent_uninsured + 
##     percent_adults_with_obesity, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.587 -1.937 -0.326  1.861  8.244 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -0.43435    0.52612  -0.826   0.4091    
## income_ratio                 1.30683    0.09108  14.348   <2e-16 ***
## percent_uninsured            0.03641    0.01425   2.555   0.0107 *  
## percent_adults_with_obesity  0.35352    0.01169  30.253   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.702 on 1906 degrees of freedom
## Multiple R-squared:  0.408,  Adjusted R-squared:  0.407 
## F-statistic: 437.8 on 3 and 1906 DF,  p-value: < 2.2e-16


Plots with Regression Line


  • Regression line plot with percent_uninsured as predictor: From the observations’ pattern, a second order fit would be better.
  • Regression line plot with percent_adults_with_obesity as predictor. A linear regression looks like a good fit.




Study of Residuals and Other Diagnostics



Plots: Time Plot, Residuals vs Y, Normal QQ, Error Variances, Scale-Location, Residual vs Leverage


  • Time plot of residuals:

    • There is no obvious pattern in the time plot.



  • Additional Plots to check regression assumptions:

    • All plot show that the assumptions are being met with this multiple linear regression model.



Shapiro-Wilk Test: Used to test for normality of errors:


  • Alternatives:

    • \(H_0\): if p > 0.05, Error variances are normally distributed
    • \(H_a\): if p < 0.05, Error variances are not normally distributed


  • Conclusion:

    • The p-value = 0.0003605

    • The test tells us that the errors are not normally distributed.

    • This test is not agreeing with what we found in our visual analysis of the Normal QQ plot.

    • A histogram of the residuals is needed.


shapiro.test(rstandard(multi_reg))
## 
##  Shapiro-Wilk normality test
## 
## data:  rstandard(multi_reg)
## W = 0.98702, p-value = 0.000000000004059



Histogram of Residuals
  • The histogram of the residuals shows a right skewed normal curve.

  • There is less normality than in previous regression line.




ANOVA Table

  • Analysis of the Variance for each predictor shows that the following features have the highest F-score in this model:

    • Percent adults with obesity has the highest F-score.
options("scipen"=10)
anova(multi_reg)
## Analysis of Variance Table
## 
## Response: percent_smokers
##                               Df  Sum Sq Mean Sq F value         Pr(>F)    
## income_ratio                   1  2572.2  2572.2 352.294      < 2.2e-16 ***
## percent_uninsured              1   334.7   334.7  45.843 0.000000000017 ***
## percent_adults_with_obesity    1  6682.6  6682.6 915.262      < 2.2e-16 ***
## Residuals                   1906 13916.2     7.3                           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1



library(lmtest)
bptest(multi_reg, studentize = TRUE)
## 
##  studentized Breusch-Pagan test
## 
## data:  multi_reg
## BP = 39.1, df = 3, p-value = 0.00000001653


Predicting Percent Smokers: Multiple Linear Regression model




df_predict <- data.frame(income_ratio = 4.478032, percent_uninsured = 8.743292, percent_adults_with_obesity = 25.6 )

y_hat2<- predict(multi_reg, df_predict, interval = "prediction", level = 0.90, se.fit = FALSE)

mae2 <- mean(abs(df$percent_smokers[1]-y_hat2))
mae2
## [1] 3.898311



Conclusion


The multiple first order linear regression model did better at predicting than the simple first order linear regression model. Although the simple linear regression model did have all assumption met in terms of errors (from plots) and the predictor variable was significant, it has higher variance, higher MAE, and lower \(R^2\). We can conclude that the multiple linear regression model is a better model.



Model Predictors \(R^2\) Variance \(s^2\) Mean Absolute Error
simple income_ratio 0.1094 10.9714 5.248349
multiple income_ratio, percent_uninsured, percent_adults_with_obesity 0.407 7.301244 3.898311