# Enter your name here: Joshua Gaze
# 1. I did this homework by myself, with help from the book and the professor.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
library(imputeTS)
## Warning: package 'imputeTS' was built under R version 4.1.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
The chapter on linear models (“Lining Up Our Models”) introduces linear predictive modeling using the tool known as multiple regression. The term “multiple regression” has an odd history, dating back to an early scientific observation of a phenomenon called “regression to the mean.” These days, multiple regression is just an interesting name for using linear modeling to assess the connection between one or more predictor variables and an outcome variable.
In this exercise, you will predict Ozone air levels from three predictors.
# Copy the built-in airquality data set into a working data frame.
air <- datasets::airquality
# Compact overview: one line per column with its type and leading values.
glimpse(air)
## Rows: 153
## Columns: 6
## $ Ozone <int> 41, 36, 12, 18, NA, 28, 23, 19, 8, NA, 7, 16, 11, 14, 18, 14, ~
## $ Solar.R <int> 190, 118, 149, 313, NA, NA, 299, 99, 19, 194, NA, 256, 290, 27~
## $ Wind <dbl> 7.4, 8.0, 12.6, 11.5, 14.3, 14.9, 8.6, 13.8, 20.1, 8.6, 6.9, 9~
## $ Temp <int> 67, 72, 74, 62, 56, 66, 65, 59, 61, 69, 74, 69, 66, 68, 58, 64~
## $ Month <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,~
## $ Day <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,~
# Open the help page that documents the airquality variables.
help("airquality")
## starting httpd help server ... done
# Ozone: Mean ozone in parts per billion from 1300 to 1500 hours at Roosevelt Island
# Solar.R: Solar radiation in Langleys in the frequency band 4000–7700 Angstroms from 0800 to 1200 hours at Central Park
# Wind: Average wind speed in miles per hour at 0700 and 1000 hours at LaGuardia Airport
# Temp: Maximum daily temperature in degrees Fahrenheit at La Guardia Airport.
# Count the missing values in each of the four variables used for modeling.
summarise(
  air,
  ozone_na = sum(is.na(Ozone)),
  solar_na = sum(is.na(Solar.R)),
  wind_na = sum(is.na(Wind)),
  temp_na = sum(is.na(Temp))
)
## ozone_na solar_na wind_na temp_na
## 1 37 7 0 0
# There are 37 missing values in the Ozone field and 7 missing values in the Solar.R field; Wind and Temp have no missing values.
# Replace missing values in the four modeling variables by passing each
# column through na_interpolation() with its default settings.
air <- air %>%
  mutate(across(c(Ozone, Solar.R, Wind, Temp), na_interpolation))
# Re-count the missing values per variable to confirm that none remain.
summarise(
  air,
  ozone_na = sum(is.na(Ozone)),
  solar_na = sum(is.na(Solar.R)),
  wind_na = sum(is.na(Wind)),
  temp_na = sum(is.na(Temp))
)
## ozone_na solar_na wind_na temp_na
## 1 0 0 0 0
# Scatter plots of Ozone against each candidate predictor.

# Solar.R vs. Ozone: appears weakly positive, close to no correlation.
ggplot(air, aes(x = Solar.R, y = Ozone)) +
  geom_point()

# Wind vs. Ozone: appears negatively correlated.
ggplot(air, aes(x = Wind, y = Ozone)) +
  geom_point()

# Temp vs. Ozone: appears positively correlated.
ggplot(air, aes(x = Temp, y = Ozone)) +
  geom_point()
# Simple regression: Ozone modeled on Wind alone.
srm <- lm(
  formula = Ozone ~ Wind,
  data = air
)
# Show the fitted call and coefficients.
print(srm)
##
## Call:
## lm(formula = Ozone ~ Wind, data = air)
##
## Coefficients:
## (Intercept) Wind
## 89.021 -4.592
# Full summary of the simple model: residual quantiles, coefficient table
# with p-values, R-squared values, and the overall F-statistic.
summary(srm)
##
## Call:
## lm(formula = Ozone ~ Wind, data = air)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.332 -18.332 -4.155 14.163 94.594
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 89.0205 6.6991 13.288 < 2e-16 ***
## Wind -4.5925 0.6345 -7.238 2.15e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.56 on 151 degrees of freedom
## Multiple R-squared: 0.2576, Adjusted R-squared: 0.2527
## F-statistic: 52.39 on 1 and 151 DF, p-value: 2.148e-11
# Coefficient: -4.592
# p-value: 2.148e-11
# Adjusted R-squared: 0.2527
# For every one-unit (1 mph) increase in the predictor Wind, Ozone is expected to decrease by about 4.59 units (ppb).
# Multiple regression: Ozone modeled on Solar.R, Wind, and Temp together.
mrm <- lm(
  formula = Ozone ~ Solar.R + Wind + Temp,
  data = air
)
# Coefficient table, significance tests, and fit statistics for the model.
summary(mrm)
##
## Call:
## lm(formula = Ozone ~ Solar.R + Wind + Temp, data = air)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.651 -15.622 -4.981 12.422 101.411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -52.16596 21.90933 -2.381 0.0185 *
## Solar.R 0.01654 0.02272 0.728 0.4678
## Wind -2.69669 0.63085 -4.275 3.40e-05 ***
## Temp 1.53072 0.24115 6.348 2.49e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.26 on 149 degrees of freedom
## Multiple R-squared: 0.4321, Adjusted R-squared: 0.4207
## F-statistic: 37.79 on 3 and 149 DF, p-value: < 2.2e-16
# Adjusted R-squared: 0.4207
# The adjusted R-squared for the multiple regression model (Ozone predicted from Solar.R, Wind, and Temp) was 0.4207, compared with 0.2527 for the simple regression model (Ozone predicted from Wind alone). Adding the two additional predictor variables, Solar.R and Temp, therefore improved the adjusted R-squared.
# Of the three predictor variables included in the multiple regression model, only two, Wind and Temp, were statistically significant predictors of Ozone levels; Solar.R was not (p = 0.4678).
# The coefficient of the predictor variable Wind was -2.69669
# The coefficient of the predictor variable Temp was 1.53072
# Single-row data frame of predictor values to feed to the fitted model.
predDF <- data.frame(
  Solar.R = 290,
  Wind = 13,
  Temp = 61
)
Use this data frame with the predict() function to predict the expected value of Ozone:
# Predicted Ozone for the single row in predDF under the multiple
# regression model.
predict(object = mrm, newdata = predDF)
## 1
## 10.9464
# The predicted value of Ozone using the multiple regression model from part G is estimated at 10.9464 using the inputted dataframe, predDF, from the first portion of part I.
Review the quality of the model by commenting on its adjusted R-Squared.
# Model Temp as the outcome, predicted from Ozone, Solar.R, and Wind.
mrm_j <- lm(
  formula = Temp ~ Ozone + Solar.R + Wind,
  data = air
)
# Show the fitted call and coefficients.
print(mrm_j)
##
## Call:
## lm(formula = Temp ~ Ozone + Solar.R + Wind, data = air)
##
## Coefficients:
## (Intercept) Ozone Solar.R Wind
## 74.69322 0.13905 0.01575 -0.58018
# Full summary of the Temp model: residual quantiles, coefficient table
# with p-values, R-squared values, and the overall F-statistic.
summary(mrm_j)
##
## Call:
## lm(formula = Temp ~ Ozone + Solar.R + Wind, data = air)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.831 -4.802 1.174 4.880 18.004
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 74.693222 2.796787 26.707 < 2e-16 ***
## Ozone 0.139055 0.021907 6.348 2.49e-09 ***
## Solar.R 0.015751 0.006737 2.338 0.02072 *
## Wind -0.580176 0.195774 -2.963 0.00354 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.313 on 149 degrees of freedom
## Multiple R-squared: 0.4148, Adjusted R-squared: 0.403
## F-statistic: 35.21 on 3 and 149 DF, p-value: < 2.2e-16
# Adjusted R-squared: 0.403
# All three predictors used (Ozone, Solar.R, Wind) are statistically significant in predicting Temp
# 40.3% of the variation in Temp can be explained by the variation amongst the variables of Ozone, Solar.R, and Wind.