Estimating ozone levels in the atmosphere is very important for controlling air pollution in our environment. R's data sets repository includes the airquality data, which consists of the following columns (first rows shown):
# Preview the first six observations of the airquality data
head(airquality, n = 6L)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
We notice that the dataset contains some missing values, which need to be cleaned before building a predictive model.
# Per-column summary statistics; the NA's row reports missing-value counts
summary(object = airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
##
# Total number of observations in the data set
dim(airquality)[1L]
## [1] 153
# Number of observations without a missing value in any column
sum(complete.cases(airquality))
## [1] 111
# Drop incomplete rows, then keep only the four measurement columns
# (Month and Day are left out of the modelling data)
raw <- na.omit(airquality)[c("Ozone", "Solar.R", "Wind", "Temp")]
head(raw, n = 6L)
## Ozone Solar.R Wind Temp
## 1 41 190 7.4 67
## 2 36 118 8.0 72
## 3 12 149 12.6 74
## 4 18 313 11.5 62
## 7 23 299 8.6 65
## 8 19 99 13.8 59
# Scatter-plot matrix of every pair of variables in raw
plot(x = raw)
### Simple linear regression of Ozone on Temp
mod.oz.temp <- lm(formula = Ozone ~ Temp, data = raw)
mod.oz.temp
##
## Call:
## lm(formula = Ozone ~ Temp, data = raw)
##
## Coefficients:
## (Intercept) Temp
## -147.646 2.439
# Scatter plot of Ozone against Temp with the fitted regression line on top
plot(x = raw$Temp, y = raw$Ozone, main = "Predicting Ozone")
abline(reg = mod.oz.temp, col = 4, lwd = 2)
# Residual summary, coefficient table and goodness-of-fit statistics
summary(object = mod.oz.temp)
##
## Call:
## lm(formula = Ozone ~ Temp, data = raw)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.922 -17.459 -0.874 10.444 118.078
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -147.6461 18.7553 -7.872 2.76e-12 ***
## Temp 2.4391 0.2393 10.192 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.92 on 109 degrees of freedom
## Multiple R-squared: 0.488, Adjusted R-squared: 0.4833
## F-statistic: 103.9 on 1 and 109 DF, p-value: < 2.2e-16
# 95% confidence intervals for the intercept and the Temp slope
confint(object = mod.oz.temp, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -184.818372 -110.473773
## Temp 1.964787 2.913433
# Multiple regression: Ozone explained by every other column of raw
mod.oz.all <- lm(formula = Ozone ~ ., data = raw)
mod.oz.all
##
## Call:
## lm(formula = Ozone ~ ., data = raw)
##
## Coefficients:
## (Intercept) Solar.R Wind Temp
## -64.34208 0.05982 -3.33359 1.65209
# Compare the observed ozone readings with the in-sample predictions of the
# multiple regression model fitted above. The Temp-only model is already
# visualised by the scatter plot with its regression line, so this plot
# uses mod.oz.all rather than mod.oz.temp.
# Column 1: observed Ozone; column 2: fitted values of mod.oz.all.
dat <- cbind(raw$Ozone, predict(mod.oz.all))
# Draw both columns against the observation index as points joined by lines
matplot(dat, type = "b", pch = 1, col = 1:2)
# Legend colours follow the column order of dat
legend("topleft", legend = c("observed", "predicted"), col = 1:2, pch = 1)
# Use the multiple regression model to predict Ozone for a new observation
# described by its Solar.R, Wind and Temp values
newdat <- data.frame(
  Solar.R = 185,
  Wind = 7.4,
  Temp = 65
)
predict(object = mod.oz.all, newdata = newdat)
## 1
## 29.44219