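The `airquality` dataset contains daily air quality measurements taken in New York from May to September 1973. This R Markdown file performs exploratory data analysis, visualization, missing-value treatment (rows containing missing values are dropped), and statistical modeling (a linear regression of ozone on wind, temperature, and solar radiation).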
# Load the built-in airquality data set into the workspace and preview
# its first six rows.
data("airquality")
head(airquality, n = 6L)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
# Per-column summary of the raw data; columns that contain missing values
# get an extra "NA's" row in the printed table.
summary(object = airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
##
# Listwise deletion: keep only the rows that have no missing value in any
# column, then summarise the reduced data set.
airquality_clean <- drop_na(airquality)
summary(object = airquality_clean)
## Ozone Solar.R Wind Temp
## Min. : 1.0 Min. : 7.0 Min. : 2.30 Min. :57.00
## 1st Qu.: 18.0 1st Qu.:113.5 1st Qu.: 7.40 1st Qu.:71.00
## Median : 31.0 Median :207.0 Median : 9.70 Median :79.00
## Mean : 42.1 Mean :184.8 Mean : 9.94 Mean :77.79
## 3rd Qu.: 62.0 3rd Qu.:255.5 3rd Qu.:11.50 3rd Qu.:84.50
## Max. :168.0 Max. :334.0 Max. :20.70 Max. :97.00
## Month Day
## Min. :5.000 Min. : 1.00
## 1st Qu.:6.000 1st Qu.: 9.00
## Median :7.000 Median :16.00
## Mean :7.216 Mean :15.95
## 3rd Qu.:9.000 3rd Qu.:22.50
## Max. :9.000 Max. :31.00
# Daily ozone readings drawn as one line per month. Month is converted to a
# factor inside the aesthetic so that each month gets its own discrete colour.
# na.rm = TRUE drops the missing ozone readings without a warning.
ggplot(
  data = airquality,
  mapping = aes(x = Day, y = Ozone, color = factor(Month))
) +
  geom_line(na.rm = TRUE) +
  labs(
    title = "Ozone Levels Over Days",
    x = "Day",
    y = "Ozone (ppb)",
    color = "Month"
  )
# Scatter plot of ozone against wind speed on the complete-case data, with a
# straight least-squares trend line and no confidence ribbon.
# The smoother's formula is intentionally left at its default.
ggplot(
  data = airquality_clean,
  mapping = aes(x = Wind, y = Ozone)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Ozone vs Wind",
    x = "Wind (mph)",
    y = "Ozone (ppb)"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Distribution of daily temperature, binned into 15 bars.
ggplot(data = airquality, mapping = aes(x = Temp)) +
  geom_histogram(bins = 15, fill = "lightblue", color = "black") +
  labs(
    title = "Histogram of Temperature",
    x = "Temperature (F)",
    y = "Count"
  )
# Recode Month as a factor so the boxplot draws one box (and one fill colour)
# per month. This overwrites the Month column of the workspace copy of
# `airquality` from here on.
airquality[["Month"]] <- factor(airquality[["Month"]])

# Ozone distribution for each month; na.rm = TRUE drops the missing ozone
# readings without a warning.
ggplot(
  data = airquality,
  mapping = aes(x = Month, y = Ozone, fill = Month)
) +
  geom_boxplot(na.rm = TRUE) +
  labs(
    title = "Ozone Levels Across Months",
    x = "Month",
    y = "Ozone (ppb)"
  )
# Pairwise Pearson correlations between all columns of the complete-case data.
cor_matrix <- cor(airquality_clean)

# Draw the correlation matrix as a heatmap.
# symm = TRUE tells heatmap() that the matrix is symmetric: the same ordering
# is used for rows and columns, and the default scaling becomes "none".
# Without it, heatmap() centres and rescales every row (scale = "row"), so the
# colours would no longer represent the correlation values themselves.
heatmap(
  cor_matrix,
  symm = TRUE,
  main = "Correlation Heatmap of Air Quality Variables"
)
# Multiple linear regression of ozone on wind, temperature and solar
# radiation, fitted to the complete-case data; print the coefficient table
# and fit statistics.
model1 <- lm(
  formula = Ozone ~ Wind + Temp + Solar.R,
  data = airquality_clean
)
summary(object = model1)
##
## Call:
## lm(formula = Ozone ~ Wind + Temp + Solar.R, data = airquality_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.485 -14.219 -3.551 10.097 95.619
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -64.34208 23.05472 -2.791 0.00623 **
## Wind -3.33359 0.65441 -5.094 1.52e-06 ***
## Temp 1.65209 0.25353 6.516 2.42e-09 ***
## Solar.R 0.05982 0.02319 2.580 0.01124 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.18 on 107 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.5948
## F-statistic: 54.83 on 3 and 107 DF, p-value: < 2.2e-16
# Show the standard lm diagnostic plots together in a 2 x 2 grid.
# par() returns the previous values of the settings it changes, so keep them
# and restore exactly those afterwards instead of assuming the layout was 1 x 1.
old_par <- par(mfrow = c(2, 2))
plot(model1)
par(old_par)