Using R, build a multiple regression model for data that interests you. Include in this model at least one quadratic term, one dichotomous term, and one dichotomous vs. quantitative interaction term. Interpret all coefficients. Conduct residual analysis. Was the linear model appropriate? Why or why not?
# Load the COVID-19 data from GitHub and drop every row containing an NA
covid_ds <- na.omit(
  read.csv(
    file = "https://raw.githubusercontent.com/monuchacko/cuny_msds/master/data_605/full_data.csv"
  )
)
# Preview the first six rows as a formatted table
knitr::kable(head(covid_ds, n = 6L))
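| date       | location    | new_cases | new_deaths | total_cases | total_deaths |
|:-----------|:------------|----------:|-----------:|------------:|-------------:|
| 2019-12-31 | Afghanistan |         0 |          0 |           0 |            0 |
| 2020-01-01 | Afghanistan |         0 |          0 |           0 |            0 |
| 2020-01-02 | Afghanistan |         0 |          0 |           0 |            0 |
| 2020-01-03 | Afghanistan |         0 |          0 |           0 |            0 |
| 2020-01-04 | Afghanistan |         0 |          0 |           0 |            0 |
| 2020-01-05 | Afghanistan |         0 |          0 |           0 |            0 |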
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Compact overview of covid_ds: one line per column with its type and
# leading values
dplyr::glimpse(covid_ds)
## Observations: 6,271
## Variables: 6
## $ date <fct> 2019-12-31, 2020-01-01, 2020-01-02, 2020-01-03, 2020-0...
## $ location <fct> Afghanistan, Afghanistan, Afghanistan, Afghanistan, Af...
## $ new_cases <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ new_deaths <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ total_cases <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ total_deaths <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
# Model specification ----
# The response is new_cases, so every predictor is built from other columns.
# Putting a function of new_cases itself on the right-hand side would leak
# the response into the fit and inflate R-squared.
# NOTE(review): any summary(covid_model) output printed before this change
# came from a different specification and must be regenerated.

# Quadratic term: square of total_cases
case_qv <- covid_ds$total_cases^2
# Dichotomous term: 1 when the row's total_deaths is positive, 0 otherwise
case_di <- as.integer(covid_ds$total_deaths > 0)
# Dichotomous vs. quantitative interaction: lets the total_cases slope
# differ between the two case_di groups
case_di_qi <- case_di * covid_ds$total_cases
covid_model <- lm(
  new_cases ~ total_cases + case_qv + case_di + case_di_qi,
  data = covid_ds
)
summary(covid_model)
##
## Call:
## lm(formula = new_cases ~ total_cases + case_qv + case_di_qi,
## data = covid_ds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4926.3 -29.3 -29.3 -27.5 5476.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.928e+01 4.293e+00 6.821 9.91e-12 ***
## total_cases 2.763e-02 4.510e-04 61.262 < 2e-16 ***
## case_qv 4.965e-05 1.039e-06 47.777 < 2e-16 ***
## case_di_qi -4.919e-04 2.276e-05 -21.608 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 336.7 on 6267 degrees of freedom
## Multiple R-squared: 0.87, Adjusted R-squared: 0.8699
## F-statistic: 1.398e+04 on 3 and 6267 DF, p-value: < 2.2e-16
# Residuals vs. fitted values; a horizontal line at zero marks where
# perfectly fitted points would fall
plot(
  x = fitted(covid_model),
  y = resid(covid_model),
  xlab = "Fitted Values",
  ylab = "Residuals",
  main = "Residuals vs. Fitted"
)
abline(h = 0)

# Normal Q-Q plot of the model residuals with its reference line
model_resid <- resid(covid_model)
qqnorm(model_resid)
qqline(model_resid)
