# Introduction
# The `airquality` dataset contains daily air pollution and weather
# measurements (Ozone, Solar.R, Wind, Temp) taken in New York from May to
# September 1973. It is commonly used in R for practicing data cleaning,
# analysis, and visualization because it holds real-world values and
# includes missing data.
#Dataset Overview
# Load the built-in dataset into the workspace.
data("airquality")
# Display the first five rows. The data frame itself must be passed here:
# head("airquality") would only echo the character string "airquality".
head(airquality, n = 5)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
# Load the Required Libraries
# library(dplyr)
# library(ggplot2)
# library(tidyr)
# library(psych)
# library(corrplot)
# Load the dataset and preview its leading rows (six, the default for head).
data(airquality)
head(airquality, n = 6L)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
# Show each column's type together with its first few values.
str(object = airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
# Per-column summary statistics, including the NA count where values are missing.
summary(object = airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
##
# Number of missing values in each column.
vapply(airquality, function(column) sum(is.na(column)), numeric(1))
## Ozone Solar.R Wind Temp Month Day
## 37 7 0 0 0 0
# 3. Descriptive statistics
summary(object = airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
##
# Count the days whose Ozone reading exceeds 100; which() skips the NA
# comparisons, so missing readings are not counted.
length(which(airquality[["Ozone"]] > 100))
## [1] 7
# Q5. Find the correlation between Ozone and Temperature.
# Correlation between Ozone and Temp, computed only over the rows where
# both values are present ("complete.obs").
with(airquality, cor(Ozone, Temp, use = "complete.obs"))
## [1] 0.6983603
# Histogram showing how the Ozone readings are distributed.
hist(
  x = airquality[["Ozone"]],
  col = "skyblue",
  border = "white",
  main = "Distribution of Ozone Levels",
  xlab = "Ozone (ppb)"
)
# One box of Temp values per Month, drawn side by side for comparison.
boxplot(
  Temp ~ Month,
  data = airquality,
  col = "lightgreen",
  main = "Temperature by Month",
  xlab = "Month",
  ylab = "Temperature (°F)"
)
# Interpretation: The temperature increases from May to July–August and
# slightly decreases in September.
# Scatter plot of Ozone (y-axis) against Wind (x-axis), drawn with solid
# coral points.
with(
  airquality,
  plot(
    Wind, Ozone,
    pch = 19,
    col = "coral",
    main = "Ozone vs Wind Speed",
    xlab = "Wind (mph)",
    ylab = "Ozone (ppb)"
  )
)