Predicting Ozone Levels in the Atmosphere

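Estimating ozone levels in the atmosphere is very important for controlling air pollution in our environment. R's built-in data sets repository includes the airquality data set, consisting of 153 observations of six variables: Ozone, Solar.R, Wind, Temp, Month and Day.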

The data set

# Preview the first six rows of the built-in airquality data set.
head(airquality, n = 6L)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6

We notice that the data set contains some missing values, which need to be cleaned up before building a predictive model.

# Per-column summary statistics; the "NA's" row reports how many values
# are missing in each column.
summary(object = airquality)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
## 
# Total number of records in the data set.
nrow(airquality)
## [1] 153
# Number of records with no missing value in any column.
sum(complete.cases(airquality))
## [1] 111

Selecting Variables

# Discard incomplete rows, then keep the first four columns
# (Ozone, Solar.R, Wind, Temp); Month and Day are left out.
raw <- na.omit(airquality)[c("Ozone", "Solar.R", "Wind", "Temp")]
head(raw, n = 6L)
##   Ozone Solar.R Wind Temp
## 1    41     190  7.4   67
## 2    36     118  8.0   72
## 3    12     149 12.6   74
## 4    18     313 11.5   62
## 7    23     299  8.6   65
## 8    19      99 13.8   59

Visualizing the data

# Scatterplot matrix of every pair of columns in raw. This is the call
# plot() makes for a data frame with more than two columns.
pairs(data.matrix(raw))

### Building the model

# Simple linear regression: Ozone explained by Temp alone.
mod.oz.temp <- lm(formula = Ozone ~ Temp, data = raw)
# Show the call and the estimated coefficients.
print(mod.oz.temp)
## 
## Call:
## lm(formula = Ozone ~ Temp, data = raw)
## 
## Coefficients:
## (Intercept)         Temp  
##    -147.646        2.439
# Scatter of Ozone against Temp with the fitted regression line on top.
plot(x = raw$Temp, y = raw$Ozone, main = "Predicting Ozone")
abline(reg = mod.oz.temp, col = 4, lwd = 2)

# Residuals, coefficient table and goodness-of-fit statistics.
print(summary(mod.oz.temp))
## 
## Call:
## lm(formula = Ozone ~ Temp, data = raw)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.922 -17.459  -0.874  10.444 118.078 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -147.6461    18.7553  -7.872 2.76e-12 ***
## Temp           2.4391     0.2393  10.192  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.92 on 109 degrees of freedom
## Multiple R-squared:  0.488,  Adjusted R-squared:  0.4833 
## F-statistic: 103.9 on 1 and 109 DF,  p-value: < 2.2e-16
# 95% confidence intervals for the intercept and the Temp slope.
confint(object = mod.oz.temp, level = 0.95)
##                   2.5 %      97.5 %
## (Intercept) -184.818372 -110.473773
## Temp           1.964787    2.913433
# Multiple regression: Ozone explained by every other column of raw
# (Solar.R, Wind and Temp).
mod.oz.all <- lm(formula = Ozone ~ ., data = raw)
# Show the call and the estimated coefficients.
print(mod.oz.all)
## 
## Call:
## lm(formula = Ozone ~ ., data = raw)
## 
## Coefficients:
## (Intercept)      Solar.R         Wind         Temp  
##   -64.34208      0.05982     -3.33359      1.65209
# Compare observed Ozone with the fitted values of the multiple regression
# model (mod.oz.all). This section evaluates that model, so its predictions,
# not those of the Temp-only model mod.oz.temp, belong in the comparison.
dat <- cbind(raw$Ozone, predict(mod.oz.all))
# One series per column of dat, plotted against the row index.
matplot(dat, type = "b", pch = 1, col = 1:2)
# Column 1 is the observed series, column 2 the predicted one.
legend("topleft", legend = c("observed", "predicted"), col = 1:2, pch = 1)

# Use the multiple regression model to predict Ozone for one new
# observation of Solar.R, Wind and Temp.
newdat <- data.frame(
  Solar.R = 185,
  Wind = 7.4,
  Temp = 65
)
predict(object = mod.oz.all, newdata = newdat)
##        1 
## 29.44219