Introduction

The airquality dataset contains daily air-quality measurements taken in New York from May to September 1973. This R Markdown file performs exploratory data analysis, visualization, missing-value treatment, and statistical modeling.

Load Dataset

# Load the built-in dataset into the workspace and preview its first rows.
data("airquality")
head(airquality, n = 6L)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6

Summary Statistics

# Per-column summary of the raw data; the NA's row shows which columns
# contain missing values.
summary(object = airquality)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
## 

Handling Missing Values

# Keep only the rows in which every variable was observed. The result is
# stored separately, so `airquality` itself still contains the NA rows.
airquality_clean <- drop_na(airquality)
summary(airquality_clean)
##      Ozone          Solar.R           Wind            Temp      
##  Min.   :  1.0   Min.   :  7.0   Min.   : 2.30   Min.   :57.00  
##  1st Qu.: 18.0   1st Qu.:113.5   1st Qu.: 7.40   1st Qu.:71.00  
##  Median : 31.0   Median :207.0   Median : 9.70   Median :79.00  
##  Mean   : 42.1   Mean   :184.8   Mean   : 9.94   Mean   :77.79  
##  3rd Qu.: 62.0   3rd Qu.:255.5   3rd Qu.:11.50   3rd Qu.:84.50  
##  Max.   :168.0   Max.   :334.0   Max.   :20.70   Max.   :97.00  
##      Month            Day       
##  Min.   :5.000   Min.   : 1.00  
##  1st Qu.:6.000   1st Qu.: 9.00  
##  Median :7.000   Median :16.00  
##  Mean   :7.216   Mean   :15.95  
##  3rd Qu.:9.000   3rd Qu.:22.50  
##  Max.   :9.000   Max.   :31.00

Plot 1: Line Plot of Ozone Over Time

# Ozone against day of month, one line per month. Month is wrapped in
# factor() inside the mapping so it is treated as a discrete colour variable.
ggplot(data = airquality) +
  geom_line(
    mapping = aes(x = Day, y = Ozone, color = factor(Month)),
    na.rm = TRUE
  ) +
  labs(
    title = "Ozone Levels Over Days",
    x = "Day",
    y = "Ozone (ppb)",
    color = "Month"
  )

Plot 2: Scatter Plot of Ozone vs Wind

# Ozone against wind speed on the complete-case rows, overlaid with a
# linear-model trend line drawn without its confidence band.
wind_scatter <- ggplot(
  data = airquality_clean,
  mapping = aes(x = Wind, y = Ozone)
)
wind_scatter +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Ozone vs Wind",
    x = "Wind (mph)",
    y = "Ozone (ppb)"
  )
## `geom_smooth()` using formula = 'y ~ x'

Plot 3: Temperature Distribution

# Distribution of the temperature readings, binned into 15 bars.
temp_hist <- ggplot(data = airquality, mapping = aes(x = Temp))
temp_hist +
  geom_histogram(bins = 15, color = "black", fill = "lightblue") +
  labs(
    title = "Histogram of Temperature",
    x = "Temperature (F)",
    y = "Count"
  )

Boxplot: Ozone Levels by Month

# Recode Month as a factor so it can serve as the grouping and fill variable.
# Note: this overwrites the Month column of `airquality` in place.
airquality[["Month"]] <- factor(airquality[["Month"]])

# One box of ozone readings per month.
month_boxes <- ggplot(
  data = airquality,
  mapping = aes(x = Month, y = Ozone, fill = Month)
)
month_boxes +
  geom_boxplot(na.rm = TRUE) +
  labs(
    title = "Ozone Levels Across Months",
    x = "Month",
    y = "Ozone (ppb)"
  )

Correlation Heatmap (Cleaned Data)

# Pairwise correlations between all columns of the complete-case data.
cor_matrix <- cor(airquality_clean)

# heatmap() standardises each row by default (scale = "row" when symm is
# FALSE), which would colour the cells by row-wise z-scores instead of the
# correlation coefficients. Treat the matrix as symmetric and switch scaling
# off so the colours reflect the actual values in cor_matrix.
heatmap(
  cor_matrix,
  symm = TRUE,
  scale = "none",
  main = "Correlation Heatmap of Air Quality Variables"
)

Linear Regression Model: Predicting Ozone

# Multiple linear regression of ozone on wind, temperature and solar
# radiation, fitted to the complete-case rows.
model1 <- lm(
  formula = Ozone ~ Wind + Temp + Solar.R,
  data = airquality_clean
)
summary(object = model1)
## 
## Call:
## lm(formula = Ozone ~ Wind + Temp + Solar.R, data = airquality_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.485 -14.219  -3.551  10.097  95.619 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -64.34208   23.05472  -2.791  0.00623 ** 
## Wind         -3.33359    0.65441  -5.094 1.52e-06 ***
## Temp          1.65209    0.25353   6.516 2.42e-09 ***
## Solar.R       0.05982    0.02319   2.580  0.01124 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.18 on 107 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.5948 
## F-statistic: 54.83 on 3 and 107 DF,  p-value: < 2.2e-16

Model Diagnostic Plots

# Draw the diagnostic plots for the fitted model in a 2 x 2 grid.
# par() returns the settings it replaces, so keeping that value and passing
# it back afterwards restores whatever layout was active before, rather than
# forcing a hard-coded 1 x 1 layout.
old_par <- par(mfrow = c(2, 2))
plot(model1)
par(old_par)

Conclusion