# Introduction
# The `airquality` dataset contains daily air pollution and weather measurements (Ozone, Solar.R, Wind, Temp) recorded in New York from May to September 1973. It is commonly used in R for practicing data cleaning, analysis, and visualization because it contains real-world values and missing data.

#Dataset Overview

# Load the built-in airquality data frame into the workspace.
data("airquality")

# Display the first five rows.
# Pass the data frame itself: head("airquality") only echoes the string.
head(airquality, n = 5L)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5

# Load the Required Libraries
# The calls below were collapsed onto a single comment line, so no package
# was ever attached; each call now sits on its own line.
# NOTE(review): none of the base-R calls visible in this file use these
# packages -- confirm they are needed before keeping them.
library(dplyr)
library(ggplot2)
library(tidyr)
library(psych)
library(corrplot)

# Load the dataset

# Bring the built-in airquality data frame into the workspace.
data(list = "airquality")

# Display the first few rows

# Print the leading six rows (the default row count for head()).
head(x = airquality, n = 6L)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6

# 1️⃣ Check the structure and summary of the dataset

# Compact structure: dimensions plus each column's type and leading values.
utils::str(airquality)
## 'data.frame':    153 obs. of  6 variables:
##  $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
##  $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
##  $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
##  $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
##  $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...
# Per-column quartiles, mean, and NA counts.
summary(object = airquality)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
## 

# 2️⃣ Handle missing values: count them per column

# Tally the NA entries column by column.
vapply(airquality, function(column) sum(is.na(column)), FUN.VALUE = numeric(1))
##   Ozone Solar.R    Wind    Temp   Month     Day 
##      37       7       0       0       0       0

# 3️⃣ Descriptive statistics

# Summary statistics (quartiles, mean, NA counts) for every column.
summary(object = airquality)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
## 
# Number of observations with Ozone above 100; which() skips NA comparisons.
length(which(airquality$Ozone > 100))
## [1] 7

# Q5. Find the correlation between Ozone and Temperature.

# Pearson correlation of Ozone and Temp, using only rows where both are present.
with(airquality, cor(Ozone, Temp, use = "complete.obs"))
## [1] 0.6983603

#Create a histogram of Ozone levels.

# Histogram of the Ozone readings with custom title, axis label, and colours.
hist(
  x = airquality$Ozone,
  col = "skyblue",
  border = "white",
  xlab = "Ozone (ppb)",
  main = "Distribution of Ozone Levels"
)

#Boxplot of Temperature across months.

# One box of Temp values per Month, drawn via the formula interface.
boxplot(
  Temp ~ Month,
  data = airquality,
  col = "lightgreen",
  xlab = "Month",
  ylab = "Temperature (°F)",
  main = "Temperature by Month"
)

# Interpretation: The temperature rises from May through July–August, then decreases slightly in September.

#Scatter plot between Wind and Ozone.

# Scatter plot of Ozone (y) against Wind (x) with solid coral points.
with(
  airquality,
  plot(
    x = Wind,
    y = Ozone,
    pch = 19,
    col = "coral",
    xlab = "Wind (mph)",
    ylab = "Ozone (ppb)",
    main = "Ozone vs Wind Speed"
  )
)