Exploratory Data Analysis
1. Temperature Trends: How do temperature patterns vary across
different continents and countries?
Formatting continent, country, and temperature
# Load necessary libraries
library(readr)
library(dplyr)
library(ggplot2)
# Load the weather observations with readr::read_csv().
# NOTE(review): the path is absolute and machine-specific; anyone else running
# this analysis has to edit it (a project-relative path would avoid that).
weather_data <- read_csv("C:\\Users\\singh\\Documents\\StatsR\\dataset\\Final\\modified_weather_repo.csv")
## Rows: 2534 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): country, continent, location_name, last_updated, condition_text, ...
## dbl (30): latitude, longitude, last_updated_epoch, temperature_celsius, tem...
## time (3): sunrise, sunset, moonset
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Coerce the temperature column to numeric before it is averaged
weather_data[["temperature_celsius"]] <- as.numeric(weather_data[["temperature_celsius"]])
Aggregating Data
# Aggregate data
# Mean temperature (Celsius) per continent; missing readings are ignored
temperature_by_continent <- weather_data %>%
  group_by(continent) %>%
  summarise(avg_temp_celsius = mean(temperature_celsius, na.rm = TRUE))

# Same aggregation, one row per country
temperature_by_country <- weather_data %>%
  group_by(country) %>%
  summarise(avg_temp_celsius = mean(temperature_celsius, na.rm = TRUE))
Visualizations
# Plot for continents
# Bars are ordered from the warmest to the coolest continent
ggplot(
  temperature_by_continent,
  aes(
    x = reorder(continent, desc(avg_temp_celsius)),
    y = avg_temp_celsius,
    fill = continent
  )
) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Average Temperature by Continent",
    x = "Continent",
    y = "Average Temperature (C)"
  )

# Plot for countries (top 10 for clarity)
# The cut-off lives in one variable so the table and the plot title cannot
# drift apart.
n_top_countries <- 10

# Hottest countries, ordered from highest to lowest mean temperature
top_countries <- temperature_by_country %>%
  arrange(desc(avg_temp_celsius)) %>%
  head(n_top_countries)

# Horizontal bars; reorder() sorts ascending, so after coord_flip() the hottest
# country ends up at the top
ggplot(
  top_countries,
  aes(x = reorder(country, avg_temp_celsius), y = avg_temp_celsius, fill = country)
) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  coord_flip() +
  labs(
    title = paste("Top", n_top_countries, "Countries by Average Temperature"),
    x = "Country",
    y = "Average Temperature (C)"
  )

2. Extreme Temperature: Which countries have extreme temperature
variations?
Filtering by high and low threshold
library(dplyr)
# NOTE(review): this re-reads the file that was already loaded into
# weather_data at the start of the analysis, discarding any in-memory changes
# made since then.
weather_data <- read_csv("C:\\Users\\singh\\Documents\\StatsR\\dataset\\Final\\modified_weather_repo.csv")
## Rows: 2534 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): country, continent, location_name, last_updated, condition_text, ...
## dbl (30): latitude, longitude, last_updated_epoch, temperature_celsius, tem...
## time (3): sunrise, sunset, moonset
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Calculate mean and standard deviation for each country.
# na.rm = TRUE keeps a single missing reading from turning a country's
# statistics into NA, which would silently exclude that country from the
# filter below; it also matches the NA handling of the temperature averages
# computed earlier in this analysis.
country_stats <- weather_data %>%
  group_by(country) %>%
  summarize(
    mean_temp = mean(temperature_celsius, na.rm = TRUE),
    sd_temp = sd(temperature_celsius, na.rm = TRUE)
  )

# Define a threshold for extreme temperatures: the number of standard
# deviations a reading must lie from its country's mean
extreme_threshold <- 3

# Identify extreme temperatures for each country: keep only readings outside
# mean +/- extreme_threshold * sd for their own country.
# Countries with a single reading have sd_temp = NA, so the comparison is NA
# and filter() drops them.
extreme_temps_by_country <- weather_data %>%
  inner_join(country_stats, by = "country") %>%
  filter(
    temperature_celsius > mean_temp + extreme_threshold * sd_temp |
      temperature_celsius < mean_temp - extreme_threshold * sd_temp
  )
The extreme_threshold is a multiplier of the standard deviation. It
defines how far away from the mean a temperature must be to be
considered extreme. This threshold is a key component as it sets the
sensitivity of what is considered ‘extreme’. A higher threshold would
result in fewer data points being labeled as extreme, while a lower
threshold would include more.
Visualization
library(ggplot2)
# Plot extreme temperatures on a map
# Country outlines used as the backdrop
world_map <- map_data("world")

ggplot() +
  # Backdrop: world polygons in light grey with black borders
  geom_polygon(
    data = world_map,
    aes(x = long, y = lat, group = group),
    fill = "lightgrey",
    color = "black"
  ) +
  # One point per extreme reading, coloured by its temperature
  geom_point(
    data = extreme_temps_by_country,
    aes(x = longitude, y = latitude, color = temperature_celsius),
    size = 3
  ) +
  # Country name anchored at each point
  geom_text(
    data = extreme_temps_by_country,
    aes(x = longitude, y = latitude, label = country),
    size = 3,
    hjust = 0,
    vjust = 0
  ) +
  scale_color_gradient(low = "blue", high = "red", name = "Temperature (Celsius)") +
  labs(title = "Map of Extreme Temperatures", x = "Longitude", y = "Latitude") +
  theme_minimal()

How air quality varies across different continents and countries
with quality metrics (like PM2.5, PM10)
weather_data <- read.csv("C:\\Users\\singh\\Documents\\StatsR\\dataset\\Final\\modified_weather_repo.csv")

# Aggregate data by continent and country
# Mean PM2.5 and PM10 per continent, highest PM2.5 first
air_quality_by_continent <- weather_data %>%
  group_by(continent) %>%
  summarise(
    avg_PM2.5 = mean(air_quality_PM2.5, na.rm = TRUE),
    avg_PM10 = mean(air_quality_PM10, na.rm = TRUE)
  ) %>%
  arrange(desc(avg_PM2.5))

# Same metrics per country, keeping only the ten highest PM2.5 averages
air_quality_by_country <- weather_data %>%
  group_by(country) %>%
  summarise(
    avg_PM2.5 = mean(air_quality_PM2.5, na.rm = TRUE),
    avg_PM10 = mean(air_quality_PM10, na.rm = TRUE)
  ) %>%
  arrange(desc(avg_PM2.5)) %>%
  head(10)

print(air_quality_by_continent)
## # A tibble: 8 × 3
## continent avg_PM2.5 avg_PM10
## <chr> <dbl> <dbl>
## 1 Asia 42.5 68.0
## 2 America 17.0 22.4
## 3 Africa 16.7 33.9
## 4 Europe 11.2 13.9
## 5 Atlantic 4.89 18.6
## 6 Indian 3.38 7.80
## 7 Australia 3.24 4.79
## 8 Pacific 2.47 6.24
print(air_quality_by_country)
## # A tibble: 10 × 3
## country avg_PM2.5 avg_PM10
## <chr> <dbl> <dbl>
## 1 Chile 303. 365.
## 2 China 273. 308.
## 3 Indonesia 163. 211.
## 4 Malaysia 120 141.
## 5 India 83.7 121.
## 6 Nepal 82.0 103.
## 7 Pakistan 72.8 98.8
## 8 Ethiopia 70.4 214.
## 9 Vietnam 69.2 87.2
## 10 Qatar 64.4 118.
# Visualization
# For continents and oceans: mean PM2.5, highest first
ggplot(
  air_quality_by_continent,
  aes(x = reorder(continent, desc(avg_PM2.5)), y = avg_PM2.5, fill = continent)
) +
  geom_col() +
  labs(title = "Average PM2.5 by Continent", x = "Continent", y = "Average PM2.5")

# For the ten countries with the highest mean PM2.5, as horizontal bars
ggplot(
  air_quality_by_country,
  aes(x = reorder(country, avg_PM2.5), y = avg_PM2.5, fill = country)
) +
  geom_col() +
  coord_flip() +
  labs(title = "Top 10 Countries by Average PM2.5", x = "Country", y = "Average PM2.5")

3. Examine if there’s a correlation between air quality metrics
PM2.5 and PM10
# Load necessary libraries
library(readr)
library(dplyr)
library(ggplot2)
# Correlation between PM2.5 and PM10 (cor()'s default method), computed only
# on rows where both values are present
correlation_analysis <- with(
  weather_data,
  cor(air_quality_PM2.5, air_quality_PM10, use = "complete.obs")
)
print(correlation_analysis)
## [1] 0.9399293
4. Average Wind Speed across continents and countries
weather_data <- read.csv("C:\\Users\\singh\\Documents\\StatsR\\dataset\\Final\\modified_weather_repo.csv")

# 1. Average Wind Speed by Country
# Only the ten countries with the highest mean wind speed (mph) are kept
avg_wind_speed_country <- weather_data %>%
  group_by(country) %>%
  summarize(Average_Wind_Speed = mean(wind_mph, na.rm = TRUE)) %>%
  arrange(desc(Average_Wind_Speed)) %>%
  head(10)

# 2. Average Wind Speed by Continent
avg_wind_speed_continent <- weather_data %>%
  group_by(continent) %>%
  summarize(Average_Wind_Speed = mean(wind_mph, na.rm = TRUE))
# 1. Average Wind Speed by Continent (windiest continent first)
ggplot(
  avg_wind_speed_continent,
  aes(
    x = reorder(continent, desc(Average_Wind_Speed)),
    y = Average_Wind_Speed,
    fill = continent
  )
) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Average Wind Speed by Continent",
    x = "Continent",
    y = "Average Wind Speed (mph)"
  )

# 2. Average Wind Speed by Country (horizontal bars for the ten countries kept)
ggplot(
  avg_wind_speed_country,
  aes(
    x = reorder(country, Average_Wind_Speed),
    y = Average_Wind_Speed,
    fill = country
  )
) +
  geom_col() +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Average Wind Speed by Country",
    x = "Country",
    y = "Average Wind Speed (mph)"
  )

5. Extreme Winds: Which countries have extreme winds
weather_data <- read.csv("C:\\Users\\singh\\Documents\\StatsR\\dataset\\Final\\modified_weather_repo.csv")
# Analysis of Wind Speeds
library(dplyr)

# Every column whose name contains "wind": the speed columns plus the degree
# and direction columns
wind_speed_columns <- grep("wind", names(weather_data), value = TRUE)

# Per-location maximum of each wind column.
# - na.rm is passed through an anonymous function because the
#   across(..., na.rm = TRUE) form is deprecated since dplyr 1.1.0.
# - .groups = "drop" returns an ungrouped tibble instead of one that is still
#   grouped by country, location_name and latitude.
# NOTE(review): for wind_degree and wind_direction the "maximum" is only the
# largest bearing / alphabetically last label, not the direction of the
# strongest wind.
max_wind_speeds <- weather_data %>%
  group_by(country, location_name, latitude, longitude) %>%
  summarise(
    across(all_of(wind_speed_columns), \(x) max(x, na.rm = TRUE)),
    .groups = "drop"
  )
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(all_of(wind_speed_columns), max, na.rm = TRUE)`.
## ℹ In group 1: `country = "Afghanistan"`, `location_name = "Kabul"`, `latitude =
## 34.52`, `longitude = 69.18`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
## `summarise()` has grouped output by 'country', 'location_name', 'latitude'. You
## can override using the `.groups` argument.
# Sort locations by maximum wind speed (mph), strongest first
sorted_wind_speeds <- arrange(max_wind_speeds, desc(wind_mph))

# Display top locations
head(sorted_wind_speeds)
## # A tibble: 6 × 8
## # Groups: country, location_name, latitude [6]
## country location_name latitude longitude wind_mph wind_kph wind_degree
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 Iceland Hella 63.8 -20.4 43.8 70.6 241
## 2 Norway Oslo 59.9 10.8 36.7 59 320
## 3 Sudan Khartoum 15.6 32.5 30 48.2 235
## 4 Greece Athens 38.0 23.7 29.8 47.9 360
## 5 Maldives Farukolhufunadhoo 6.15 73.3 26.6 42.8 314
## 6 Azerbaijan Baku 40.4 49.9 25.5 41 360
## # ℹ 1 more variable: wind_direction <chr>
library(ggplot2)
library(maps)
## Warning: package 'maps' was built under R version 4.3.2
# Set your wind speed threshold (mph); only locations above it are plotted
wind_speed_threshold_mph <- 25

# Filter the dataset for wind speeds above the threshold
high_wind_speeds <- subset(sorted_wind_speeds, wind_mph > wind_speed_threshold_mph)

# Get world map data (country outlines for the backdrop)
world_map <- map_data("world")

# Plot high wind speeds on a map using color gradient
ggplot() +
  geom_polygon(
    data = world_map,
    aes(x = long, y = lat, group = group),
    fill = "lightgrey",
    color = "black"
  ) +
  # One point per location, coloured by its peak wind speed
  geom_point(
    data = high_wind_speeds,
    aes(x = longitude, y = latitude, color = wind_mph),
    size = 3
  ) +
  # Country name anchored at each point
  geom_text(
    data = high_wind_speeds,
    aes(x = longitude, y = latitude, label = country),
    size = 3,
    hjust = 0,
    vjust = 0
  ) +
  scale_color_gradient(name = "Wind Speed (mph)", low = "blue", high = "red") +
  labs(title = "Map of High Wind Speeds by Country", x = "Longitude", y = "Latitude") +
  theme_minimal()

6. Most common weather conditions across countries and
continents
Analyzing Weather Conditions by Continent and Country
# NOTE(review): re-reads the same CSV that earlier sections already loaded
# into weather_data; the path is absolute and machine-specific.
weather_data <- read.csv("C:\\Users\\singh\\Documents\\StatsR\\dataset\\Final\\modified_weather_repo.csv")
#' Most frequent element of a vector (statistical mode)
#'
#' @param x An atomic vector (character, numeric, ...).
#' @return A length-one vector of the same type as `x` holding the value that
#'   occurs most often. Ties go to the value that appears first in `x`. `NA`
#'   is counted like any other value, and empty input yields `NA`.
calculate_mode <- function(x) {
  distinct_values <- unique(x)
  # Map every element to the position of its distinct value, then count
  occurrences <- tabulate(match(x, distinct_values))
  distinct_values[which.max(occurrences)]
}
# For Countries
# Find each country's most frequent condition, then count how many countries
# share each condition (most widespread first)
most_common_condition_by_country <- weather_data %>%
  group_by(country) %>%
  summarise(most_common_condition = calculate_mode(condition_text)) %>%
  ungroup() %>%
  count(most_common_condition) %>%
  arrange(desc(n))

# For Continents
# Same two steps, with continents as the unit being counted
most_common_condition_by_continent <- weather_data %>%
  group_by(continent) %>%
  summarise(most_common_condition = calculate_mode(condition_text)) %>%
  ungroup() %>%
  count(most_common_condition) %>%
  arrange(desc(n))

print(most_common_condition_by_country)
## # A tibble: 8 × 2
## most_common_condition n
## <chr> <int>
## 1 Partly cloudy 89
## 2 Clear 78
## 3 Sunny 8
## 4 Light rain shower 4
## 5 Mist 2
## 6 Patchy rain possible 2
## 7 Light rain 1
## 8 Overcast 1
print(most_common_condition_by_continent)
## # A tibble: 3 × 2
## most_common_condition n
## <chr> <int>
## 1 Partly cloudy 6
## 2 Clear 1
## 3 Sunny 1
# Visualization for countries
# Horizontal bars, one per condition, labelled with the number of countries
ggplot(
  most_common_condition_by_country,
  aes(x = reorder(most_common_condition, n), y = n, fill = most_common_condition)
) +
  geom_col() +
  geom_text(aes(label = n), hjust = -0.2, vjust = 0, size = 3.5) + # count labels
  coord_flip() +
  labs(
    title = "Top Weather Conditions Across Countries",
    x = "Weather Condition",
    y = "Frequency"
  )

# Visualization for continents
# Same layout, labelled with the number of continents
ggplot(
  most_common_condition_by_continent,
  aes(x = reorder(most_common_condition, n), y = n, fill = most_common_condition)
) +
  geom_col() +
  geom_text(aes(label = n), hjust = -0.2, vjust = 0, size = 3.5) + # count labels
  coord_flip() +
  labs(
    title = "Top Weather Conditions Across Continents",
    x = "Weather Condition",
    y = "Frequency"
  )

The weather condition that predominates on each
continent
# Load necessary libraries
library(readr)
library(dplyr)
#' Most frequent element of a vector (statistical mode)
#'
#' @param x An atomic vector (character, numeric, ...).
#' @return A length-one vector of the same type as `x` holding the value that
#'   occurs most often. Ties go to the value that appears first in `x`. `NA`
#'   is counted like any other value, and empty input yields `NA`.
calculate_mode <- function(x) {
  distinct_values <- unique(x)
  # Map every element to the position of its distinct value, then count
  occurrences <- tabulate(match(x, distinct_values))
  distinct_values[which.max(occurrences)]
}
# Most frequent weather condition within each continent, one row per
# continent. This assignment replaces the count table previously stored under
# the same name.
most_common_condition_by_continent <- weather_data %>%
  group_by(continent) %>%
  summarise(most_common_condition = calculate_mode(condition_text))

# Display the results
print(most_common_condition_by_continent)
## # A tibble: 8 × 2
## continent most_common_condition
## <chr> <chr>
## 1 Africa Partly cloudy
## 2 America Partly cloudy
## 3 Asia Partly cloudy
## 4 Atlantic Partly cloudy
## 5 Australia Sunny
## 6 Europe Clear
## 7 Indian Partly cloudy
## 8 Pacific Partly cloudy
7. Assess the diversity of weather conditions across countries and
continents, and determine which regions exhibit the most varied
weather conditions
# Load necessary libraries
library(readr)
library(dplyr)
library(ggplot2)
# Calculating the diversity of weather conditions, measured as the number of
# distinct condition_text values observed in each group
condition_diversity_by_continent <- weather_data %>%
  group_by(continent) %>%
  summarise(num_unique_conditions = n_distinct(condition_text))

condition_diversity_by_country <- weather_data %>%
  group_by(country) %>%
  summarise(num_unique_conditions = n_distinct(condition_text))

print(condition_diversity_by_continent)
## # A tibble: 8 × 2
## continent num_unique_conditions
## <chr> <int>
## 1 Africa 20
## 2 America 18
## 3 Asia 18
## 4 Atlantic 10
## 5 Australia 4
## 6 Europe 15
## 7 Indian 9
## 8 Pacific 9
print(condition_diversity_by_country)
## # A tibble: 185 × 2
## country num_unique_conditions
## <chr> <int>
## 1 Afghanistan 4
## 2 Albania 3
## 3 Algeria 4
## 4 Andorra 5
## 5 Angola 2
## 6 Antigua and Barbuda 3
## 7 Argentina 3
## 8 Armenia 3
## 9 Australia 4
## 10 Austria 3
## # ℹ 175 more rows
# Sorting to find the ten countries with the most distinct weather conditions
top_countries_diversity <- condition_diversity_by_country %>%
  arrange(desc(num_unique_conditions)) %>%
  head(10)

# For top 10 countries: horizontal bars
ggplot(
  top_countries_diversity,
  aes(
    x = reorder(country, num_unique_conditions),
    y = num_unique_conditions,
    fill = country
  )
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 10 Countries with the Most Diverse Weather Conditions",
    x = "Country",
    y = "Number of Unique Conditions"
  )

# Visualization
# For continents: bars ordered from the most to the least diverse continent
ggplot(
  condition_diversity_by_continent,
  aes(
    x = reorder(continent, desc(num_unique_conditions)),
    y = num_unique_conditions,
    fill = continent
  )
) +
  geom_col() +
  labs(
    title = "Diversity of Weather Conditions by Continent",
    x = "Continent",
    y = "Number of Unique Conditions"
  )

8. Temporal Weather Trends Analysis
Aggregate key weather parameters (temperature, humidity, air quality)
over time intervals (e.g., daily)
# Load necessary libraries
library(readr)
library(dplyr)
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Load the data
weather_data <- read.csv("C:\\Users\\singh\\Documents\\StatsR\\dataset\\Final\\modified_weather_repo.csv")

# Convert the "month/day/year hour:minute" strings to date-times.
# An explicit time zone makes the result independent of the machine running
# the script and avoids NA values for wall-clock times that do not exist in
# the local zone (daylight-saving gaps).
# NOTE(review): the time zone of last_updated is not stated in the data; "UTC"
# is used only to keep the parsed wall-clock values unchanged - confirm.
weather_data$last_updated <- as.POSIXct(
  weather_data$last_updated,
  format = "%m/%d/%Y %H:%M",
  tz = "UTC"
)

# Data aggregation over time: daily means of temperature, humidity and PM2.5.
# Rows whose timestamp failed to parse end up together in an NA date group.
daily_weather_data <- weather_data %>%
  group_by(date = floor_date(last_updated, "day")) %>%
  summarize(
    average_temperature = mean(temperature_celsius, na.rm = TRUE),
    average_humidity = mean(humidity, na.rm = TRUE),
    average_PM2.5 = mean(air_quality_PM2.5, na.rm = TRUE)
  )
Visualizations
Create visualizations like line graphs to illustrate these trends
over time.
# Visualization of trends over time
# The date axis is shared by all layers; each line maps a constant label to
# colour, and scale_color_manual() turns those labels into actual colours.
ggplot(daily_weather_data, aes(x = date)) +
  geom_line(aes(y = average_temperature, color = "Temperature")) +
  geom_line(aes(y = average_humidity, color = "Humidity")) +
  geom_line(aes(y = average_PM2.5, color = "PM2.5")) +
  labs(title = "Weather Parameters Trends Over Time", x = "Date", y = "Values") +
  scale_color_manual(
    values = c("Temperature" = "red", "Humidity" = "blue", "PM2.5" = "green")
  ) +
  theme_minimal()

9. Variation in PM2.5 across weather conditions
# Step 1: Group the data by weather condition and calculate the average air
# quality (PM2.5) for each group.
# na.rm = TRUE matches the other PM2.5 averages in this analysis and keeps one
# missing reading from turning a whole condition's mean into NA.
average_pm25_by_condition <- weather_data %>%
  group_by(condition_text) %>%
  summarise(mean_pm25 = mean(air_quality_PM2.5, na.rm = TRUE))
# Step 2: Visualize the relationship between weather condition and air quality
# as horizontal bars, one per weather condition.
ggplot(
  average_pm25_by_condition,
  aes(x = reorder(condition_text, desc(mean_pm25)), y = mean_pm25)
) +
  geom_col() +
  labs(
    title = "Average Air Quality (PM2.5) by Weather Condition",
    x = "Weather Condition",
    y = "Average PM2.5"
  ) +
  theme_bw() +
  coord_flip()

# Print the average PM2.5 for each weather condition:
print("Average PM2.5 by weather condition:")
## [1] "Average PM2.5 by weather condition:"
# Name each rounded mean after its weather condition so the printed values
# can be attributed; a bare numeric vector gives no indication of which
# condition each number belongs to.
print(round(
  setNames(
    average_pm25_by_condition$mean_pm25,
    average_pm25_by_condition$condition_text
  ),
  2
))
## [1] 19.13 25.89 15.62 17.62 5.15 15.50 10.45 11.09 147.40 10.60
## [11] 23.81 14.08 8.63 8.62 14.92 6.30 3.20 13.58 16.55 29.83
## [21] 22.20 2.97
# Step 3: Calculate the correlation coefficient between air quality and weather
# condition using dummy variables.
# model.matrix() expands condition_text into 0/1 indicator columns (the first
# level is the omitted baseline) and also adds a constant "(Intercept)" column.
df_encoded <- model.matrix(~ air_quality_PM2.5 + condition_text, data = weather_data)

# The intercept column has zero standard deviation, which makes cor() warn and
# fill that row and column with NA; drop it before correlating.
df_encoded <- df_encoded[, colnames(df_encoded) != "(Intercept)", drop = FALSE]
correlation <- cor(df_encoded)
## Warning in cor(df_encoded): the standard deviation is zero
# Step 4: Analyze the correlation coefficients to identify positive or negative correlations.
print("Correlation coefficients:")
## [1] "Correlation coefficients:"
print(correlation)
## (Intercept) air_quality_PM2.5
## (Intercept) 1 NA
## air_quality_PM2.5 NA 1.000000000
## condition_textCloudy NA 0.007064463
## condition_textFog NA -0.008350377
## condition_textHeavy rain NA -0.001900227
## condition_textHeavy rain at times NA -0.007908307
## condition_textLight drizzle NA -0.003015242
## condition_textLight rain NA -0.029541625
## condition_textLight rain shower NA -0.026043981
## condition_textMist NA 0.374140605
## condition_textModerate or heavy rain shower NA -0.010706593
## condition_textModerate or heavy rain with thunder NA 0.007685074
## condition_textModerate rain NA -0.009099037
## condition_textModerate rain at times NA -0.007441973
## condition_textOvercast NA -0.039330029
## condition_textPartly cloudy NA -0.081889726
## condition_textPatchy light drizzle NA -0.005163053
## condition_textPatchy light rain NA -0.006316417
## condition_textPatchy light rain with thunder NA -0.012583661
## condition_textPatchy rain possible NA -0.011461315
## condition_textSunny NA 0.044550636
## condition_textThundery outbreaks possible NA 0.001684179
## condition_textTorrential rain shower NA -0.011095100
## condition_textCloudy
## (Intercept) NA
## air_quality_PM2.5 0.007064463
## condition_textCloudy 1.000000000
## condition_textFog -0.006456639
## condition_textHeavy rain -0.002625472
## condition_textHeavy rain at times -0.001855756
## condition_textLight drizzle -0.002273277
## condition_textLight rain -0.010713561
## condition_textLight rain shower -0.010105933
## condition_textMist -0.010370282
## condition_textModerate or heavy rain shower -0.003942106
## condition_textModerate or heavy rain with thunder -0.007467394
## condition_textModerate rain -0.005263442
## condition_textModerate rain at times -0.002273277
## condition_textOvercast -0.011998624
## condition_textPartly cloudy -0.054911295
## condition_textPatchy light drizzle -0.001311959
## condition_textPatchy light rain -0.001311959
## condition_textPatchy light rain with thunder -0.006722962
## condition_textPatchy rain possible -0.011129262
## condition_textSunny -0.016267231
## condition_textThundery outbreaks possible -0.002935948
## condition_textTorrential rain shower -0.002273277
## condition_textFog
## (Intercept) NA
## air_quality_PM2.5 -0.008350377
## condition_textCloudy -0.006456639
## condition_textFog 1.000000000
## condition_textHeavy rain -0.003888110
## condition_textHeavy rain at times -0.002748223
## condition_textLight drizzle -0.003366536
## condition_textLight rain -0.015865907
## condition_textLight rain shower -0.014966058
## condition_textMist -0.015357539
## condition_textModerate or heavy rain shower -0.005837936
## condition_textModerate or heavy rain with thunder -0.011058599
## condition_textModerate rain -0.007794727
## condition_textModerate rain at times -0.003366536
## condition_textOvercast -0.017768980
## condition_textPartly cloudy -0.081319131
## condition_textPatchy light drizzle -0.001942903
## condition_textPatchy light rain -0.001942903
## condition_textPatchy light rain with thunder -0.009956155
## condition_textPatchy rain possible -0.016481526
## condition_textSunny -0.024090437
## condition_textThundery outbreaks possible -0.004347898
## condition_textTorrential rain shower -0.003366536
## condition_textHeavy rain
## (Intercept) NA
## air_quality_PM2.5 -0.0019002265
## condition_textCloudy -0.0026254724
## condition_textFog -0.0038881096
## condition_textHeavy rain 1.0000000000
## condition_textHeavy rain at times -0.0011175138
## condition_textLight drizzle -0.0013689396
## condition_textLight rain -0.0064515768
## condition_textLight rain shower -0.0060856702
## condition_textMist -0.0062448584
## condition_textModerate or heavy rain shower -0.0023738884
## condition_textModerate or heavy rain with thunder -0.0044967741
## condition_textModerate rain -0.0031695811
## condition_textModerate rain at times -0.0013689396
## condition_textOvercast -0.0072254262
## condition_textPartly cloudy -0.0330669171
## condition_textPatchy light drizzle -0.0007900456
## condition_textPatchy light rain -0.0007900456
## condition_textPatchy light rain with thunder -0.0040484860
## condition_textPatchy rain possible -0.0067019071
## condition_textSunny -0.0097959297
## condition_textThundery outbreaks possible -0.0017679921
## condition_textTorrential rain shower -0.0013689396
## condition_textHeavy rain at times
## (Intercept) NA
## air_quality_PM2.5 -0.0079083072
## condition_textCloudy -0.0018557560
## condition_textFog -0.0027482226
## condition_textHeavy rain -0.0011175138
## condition_textHeavy rain at times 1.0000000000
## condition_textLight drizzle -0.0009676041
## condition_textLight rain -0.0045601516
## condition_textLight rain shower -0.0043015188
## condition_textMist -0.0044140374
## condition_textModerate or heavy rain shower -0.0016779295
## condition_textModerate or heavy rain with thunder -0.0031784434
## condition_textModerate rain -0.0022403469
## condition_textModerate rain at times -0.0009676041
## condition_textOvercast -0.0051071296
## condition_textPartly cloudy -0.0233726049
## condition_textPatchy light drizzle -0.0005584259
## condition_textPatchy light rain -0.0005584259
## condition_textPatchy light rain with thunder -0.0028615811
## condition_textPatchy rain possible -0.0047370920
## condition_textSunny -0.0069240321
## condition_textThundery outbreaks possible -0.0012496654
## condition_textTorrential rain shower -0.0009676041
## condition_textLight drizzle
## (Intercept) NA
## air_quality_PM2.5 -0.0030152421
## condition_textCloudy -0.0022732766
## condition_textFog -0.0033665364
## condition_textHeavy rain -0.0013689396
## condition_textHeavy rain at times -0.0009676041
## condition_textLight drizzle 1.0000000000
## condition_textLight rain -0.0055861255
## condition_textLight rain shower -0.0052693037
## condition_textMist -0.0054071375
## condition_textModerate or heavy rain shower -0.0020554415
## condition_textModerate or heavy rain with thunder -0.0038935512
## condition_textModerate rain -0.0027443954
## condition_textModerate rain at times -0.0011853023
## condition_textOvercast -0.0062561664
## condition_textPartly cloudy -0.0286311324
## condition_textPatchy light drizzle -0.0006840644
## condition_textPatchy light rain -0.0006840644
## condition_textPatchy light rain with thunder -0.0035053990
## condition_textPatchy rain possible -0.0058028751
## condition_textSunny -0.0084818479
## condition_textThundery outbreaks possible -0.0015308236
## condition_textTorrential rain shower -0.0011853023
## condition_textLight rain
## (Intercept) NA
## air_quality_PM2.5 -0.029541625
## condition_textCloudy -0.010713561
## condition_textFog -0.015865907
## condition_textHeavy rain -0.006451577
## condition_textHeavy rain at times -0.004560152
## condition_textLight drizzle -0.005586125
## condition_textLight rain 1.000000000
## condition_textLight rain shower -0.024833321
## condition_textMist -0.025482908
## condition_textModerate or heavy rain shower -0.009686942
## condition_textModerate or heavy rain with thunder -0.018349637
## condition_textModerate rain -0.012933863
## condition_textModerate rain at times -0.005586125
## condition_textOvercast -0.029484235
## condition_textPartly cloudy -0.134933599
## condition_textPatchy light drizzle -0.003223878
## condition_textPatchy light rain -0.003223878
## condition_textPatchy light rain with thunder -0.016520342
## condition_textPatchy rain possible -0.027347952
## condition_textSunny -0.039973489
## condition_textThundery outbreaks possible -0.007214508
## condition_textTorrential rain shower -0.005586125
## condition_textLight rain shower
## (Intercept) NA
## air_quality_PM2.5 -0.026043981
## condition_textCloudy -0.010105933
## condition_textFog -0.014966058
## condition_textHeavy rain -0.006085670
## condition_textHeavy rain at times -0.004301519
## condition_textLight drizzle -0.005269304
## condition_textLight rain -0.024833321
## condition_textLight rain shower 1.000000000
## condition_textMist -0.024037624
## condition_textModerate or heavy rain shower -0.009137539
## condition_textModerate or heavy rain with thunder -0.017308922
## condition_textModerate rain -0.012200308
## condition_textModerate rain at times -0.005269304
## condition_textOvercast -0.027812012
## condition_textPartly cloudy -0.127280727
## condition_textPatchy light drizzle -0.003041033
## condition_textPatchy light rain -0.003041033
## condition_textPatchy light rain with thunder -0.015583377
## condition_textPatchy rain possible -0.025796890
## condition_textSunny -0.037706359
## condition_textThundery outbreaks possible -0.006805331
## condition_textTorrential rain shower -0.005269304
## condition_textMist
## (Intercept) NA
## air_quality_PM2.5 0.374140605
## condition_textCloudy -0.010370282
## condition_textFog -0.015357539
## condition_textHeavy rain -0.006244858
## condition_textHeavy rain at times -0.004414037
## condition_textLight drizzle -0.005407138
## condition_textLight rain -0.025482908
## condition_textLight rain shower -0.024037624
## condition_textMist 1.000000000
## condition_textModerate or heavy rain shower -0.009376558
## condition_textModerate or heavy rain with thunder -0.017761686
## condition_textModerate rain -0.012519442
## condition_textModerate rain at times -0.005407138
## condition_textOvercast -0.028539515
## condition_textPartly cloudy -0.130610121
## condition_textPatchy light drizzle -0.003120580
## condition_textPatchy light rain -0.003120580
## condition_textPatchy light rain with thunder -0.015991005
## condition_textPatchy rain possible -0.026471681
## condition_textSunny -0.038692678
## condition_textThundery outbreaks possible -0.006983344
## condition_textTorrential rain shower -0.005407138
## condition_textModerate or heavy rain shower
## (Intercept) NA
## air_quality_PM2.5 -0.010706593
## condition_textCloudy -0.003942106
## condition_textFog -0.005837936
## condition_textHeavy rain -0.002373888
## condition_textHeavy rain at times -0.001677930
## condition_textLight drizzle -0.002055441
## condition_textLight rain -0.009686942
## condition_textLight rain shower -0.009137539
## condition_textMist -0.009376558
## condition_textModerate or heavy rain shower 1.000000000
## condition_textModerate or heavy rain with thunder -0.006751836
## condition_textModerate rain -0.004759077
## condition_textModerate rain at times -0.002055441
## condition_textOvercast -0.010848865
## condition_textPartly cloudy -0.049649461
## condition_textPatchy light drizzle -0.001186241
## condition_textPatchy light rain -0.001186241
## condition_textPatchy light rain with thunder -0.006078739
## condition_textPatchy rain possible -0.010062809
## condition_textSunny -0.014708436
## condition_textThundery outbreaks possible -0.002654613
## condition_textTorrential rain shower -0.002055441
## condition_textModerate or heavy rain with thunder
## (Intercept) NA
## air_quality_PM2.5 0.007685074
## condition_textCloudy -0.007467394
## condition_textFog -0.011058599
## condition_textHeavy rain -0.004496774
## condition_textHeavy rain at times -0.003178443
## condition_textLight drizzle -0.003893551
## condition_textLight rain -0.018349637
## condition_textLight rain shower -0.017308922
## condition_textMist -0.017761686
## condition_textModerate or heavy rain shower -0.006751836
## condition_textModerate or heavy rain with thunder 1.000000000
## condition_textModerate rain -0.009014953
## condition_textModerate rain at times -0.003893551
## condition_textOvercast -0.020550627
## condition_textPartly cloudy -0.094049244
## condition_textPatchy light drizzle -0.002247055
## condition_textPatchy light rain -0.002247055
## condition_textPatchy light rain with thunder -0.011514743
## condition_textPatchy rain possible -0.019061629
## condition_textSunny -0.027861678
## condition_textThundery outbreaks possible -0.005028540
## condition_textTorrential rain shower -0.003893551
## condition_textModerate rain
## (Intercept) NA
## air_quality_PM2.5 -0.009099037
## condition_textCloudy -0.005263442
## condition_textFog -0.007794727
## condition_textHeavy rain -0.003169581
## condition_textHeavy rain at times -0.002240347
## condition_textLight drizzle -0.002744395
## condition_textLight rain -0.012933863
## condition_textLight rain shower -0.012200308
## condition_textMist -0.012519442
## condition_textModerate or heavy rain shower -0.004759077
## condition_textModerate or heavy rain with thunder -0.009014953
## condition_textModerate rain 1.000000000
## condition_textModerate rain at times -0.002744395
## condition_textOvercast -0.014485246
## condition_textPartly cloudy -0.066291233
## condition_textPatchy light drizzle -0.001583852
## condition_textPatchy light rain -0.001583852
## condition_textPatchy light rain with thunder -0.008116243
## condition_textPatchy rain possible -0.013435715
## condition_textSunny -0.019638488
## condition_textThundery outbreaks possible -0.003544400
## condition_textTorrential rain shower -0.002744395
## condition_textModerate rain at times
## (Intercept) NA
## air_quality_PM2.5 -0.0074419729
## condition_textCloudy -0.0022732766
## condition_textFog -0.0033665364
## condition_textHeavy rain -0.0013689396
## condition_textHeavy rain at times -0.0009676041
## condition_textLight drizzle -0.0011853023
## condition_textLight rain -0.0055861255
## condition_textLight rain shower -0.0052693037
## condition_textMist -0.0054071375
## condition_textModerate or heavy rain shower -0.0020554415
## condition_textModerate or heavy rain with thunder -0.0038935512
## condition_textModerate rain -0.0027443954
## condition_textModerate rain at times 1.0000000000
## condition_textOvercast -0.0062561664
## condition_textPartly cloudy -0.0286311324
## condition_textPatchy light drizzle -0.0006840644
## condition_textPatchy light rain -0.0006840644
## condition_textPatchy light rain with thunder -0.0035053990
## condition_textPatchy rain possible -0.0058028751
## condition_textSunny -0.0084818479
## condition_textThundery outbreaks possible -0.0015308236
## condition_textTorrential rain shower -0.0011853023
## condition_textOvercast
## (Intercept) NA
## air_quality_PM2.5 -0.039330029
## condition_textCloudy -0.011998624
## condition_textFog -0.017768980
## condition_textHeavy rain -0.007225426
## condition_textHeavy rain at times -0.005107130
## condition_textLight drizzle -0.006256166
## condition_textLight rain -0.029484235
## condition_textLight rain shower -0.027812012
## condition_textMist -0.028539515
## condition_textModerate or heavy rain shower -0.010848865
## condition_textModerate or heavy rain with thunder -0.020550627
## condition_textModerate rain -0.014485246
## condition_textModerate rain at times -0.006256166
## condition_textOvercast 1.000000000
## condition_textPartly cloudy -0.151118525
## condition_textPatchy light drizzle -0.003610573
## condition_textPatchy light rain -0.003610573
## condition_textPatchy light rain with thunder -0.018501913
## condition_textPatchy rain possible -0.030628266
## condition_textSunny -0.044768203
## condition_textThundery outbreaks possible -0.008079869
## condition_textTorrential rain shower -0.006256166
## condition_textPartly cloudy
## (Intercept) NA
## air_quality_PM2.5 -0.08188973
## condition_textCloudy -0.05491130
## condition_textFog -0.08131913
## condition_textHeavy rain -0.03306692
## condition_textHeavy rain at times -0.02337260
## condition_textLight drizzle -0.02863113
## condition_textLight rain -0.13493360
## condition_textLight rain shower -0.12728073
## condition_textMist -0.13061012
## condition_textModerate or heavy rain shower -0.04964946
## condition_textModerate or heavy rain with thunder -0.09404924
## condition_textModerate rain -0.06629123
## condition_textModerate rain at times -0.02863113
## condition_textOvercast -0.15111853
## condition_textPartly cloudy 1.00000000
## condition_textPatchy light drizzle -0.01652366
## condition_textPatchy light rain -0.01652366
## condition_textPatchy light rain with thunder -0.08467338
## condition_textPatchy rain possible -0.14016921
## condition_textSunny -0.20488016
## condition_textThundery outbreaks possible -0.03697725
## condition_textTorrential rain shower -0.02863113
## condition_textPatchy light drizzle
## (Intercept) NA
## air_quality_PM2.5 -0.0051630525
## condition_textCloudy -0.0013119586
## condition_textFog -0.0019429032
## condition_textHeavy rain -0.0007900456
## condition_textHeavy rain at times -0.0005584259
## condition_textLight drizzle -0.0006840644
## condition_textLight rain -0.0032238776
## condition_textLight rain shower -0.0030410326
## condition_textMist -0.0031205796
## condition_textModerate or heavy rain shower -0.0011862411
## condition_textModerate or heavy rain with thunder -0.0022470552
## condition_textModerate rain -0.0015838518
## condition_textModerate rain at times -0.0006840644
## condition_textOvercast -0.0036105731
## condition_textPartly cloudy -0.0165236648
## condition_textPatchy light drizzle 1.0000000000
## condition_textPatchy light rain -0.0003947888
## condition_textPatchy light rain with thunder -0.0020230439
## condition_textPatchy rain possible -0.0033489686
## condition_textSunny -0.0048950635
## condition_textThundery outbreaks possible -0.0008834724
## condition_textTorrential rain shower -0.0006840644
## condition_textPatchy light rain
## (Intercept) NA
## air_quality_PM2.5 -0.0063164174
## condition_textCloudy -0.0013119586
## condition_textFog -0.0019429032
## condition_textHeavy rain -0.0007900456
## condition_textHeavy rain at times -0.0005584259
## condition_textLight drizzle -0.0006840644
## condition_textLight rain -0.0032238776
## condition_textLight rain shower -0.0030410326
## condition_textMist -0.0031205796
## condition_textModerate or heavy rain shower -0.0011862411
## condition_textModerate or heavy rain with thunder -0.0022470552
## condition_textModerate rain -0.0015838518
## condition_textModerate rain at times -0.0006840644
## condition_textOvercast -0.0036105731
## condition_textPartly cloudy -0.0165236648
## condition_textPatchy light drizzle -0.0003947888
## condition_textPatchy light rain 1.0000000000
## condition_textPatchy light rain with thunder -0.0020230439
## condition_textPatchy rain possible -0.0033489686
## condition_textSunny -0.0048950635
## condition_textThundery outbreaks possible -0.0008834724
## condition_textTorrential rain shower -0.0006840644
## condition_textPatchy light rain with thunder
## (Intercept) NA
## air_quality_PM2.5 -0.012583661
## condition_textCloudy -0.006722962
## condition_textFog -0.009956155
## condition_textHeavy rain -0.004048486
## condition_textHeavy rain at times -0.002861581
## condition_textLight drizzle -0.003505399
## condition_textLight rain -0.016520342
## condition_textLight rain shower -0.015583377
## condition_textMist -0.015991005
## condition_textModerate or heavy rain shower -0.006078739
## condition_textModerate or heavy rain with thunder -0.011514743
## condition_textModerate rain -0.008116243
## condition_textModerate rain at times -0.003505399
## condition_textOvercast -0.018501913
## condition_textPartly cloudy -0.084673377
## condition_textPatchy light drizzle -0.002023044
## condition_textPatchy light rain -0.002023044
## condition_textPatchy light rain with thunder 1.000000000
## condition_textPatchy rain possible -0.017161355
## condition_textSunny -0.025084118
## condition_textThundery outbreaks possible -0.004527240
## condition_textTorrential rain shower -0.003505399
## condition_textPatchy rain possible
## (Intercept) NA
## air_quality_PM2.5 -0.011461315
## condition_textCloudy -0.011129262
## condition_textFog -0.016481526
## condition_textHeavy rain -0.006701907
## condition_textHeavy rain at times -0.004737092
## condition_textLight drizzle -0.005802875
## condition_textLight rain -0.027347952
## condition_textLight rain shower -0.025796890
## condition_textMist -0.026471681
## condition_textModerate or heavy rain shower -0.010062809
## condition_textModerate or heavy rain with thunder -0.019061629
## condition_textModerate rain -0.013435715
## condition_textModerate rain at times -0.005802875
## condition_textOvercast -0.030628266
## condition_textPartly cloudy -0.140169214
## condition_textPatchy light drizzle -0.003348969
## condition_textPatchy light rain -0.003348969
## condition_textPatchy light rain with thunder -0.017161355
## condition_textPatchy rain possible 1.000000000
## condition_textSunny -0.041524517
## condition_textThundery outbreaks possible -0.007494441
## condition_textTorrential rain shower -0.005802875
## condition_textSunny
## (Intercept) NA
## air_quality_PM2.5 0.044550636
## condition_textCloudy -0.016267231
## condition_textFog -0.024090437
## condition_textHeavy rain -0.009795930
## condition_textHeavy rain at times -0.006924032
## condition_textLight drizzle -0.008481848
## condition_textLight rain -0.039973489
## condition_textLight rain shower -0.037706359
## condition_textMist -0.038692678
## condition_textModerate or heavy rain shower -0.014708436
## condition_textModerate or heavy rain with thunder -0.027861678
## condition_textModerate rain -0.019638488
## condition_textModerate rain at times -0.008481848
## condition_textOvercast -0.044768203
## condition_textPartly cloudy -0.204880157
## condition_textPatchy light drizzle -0.004895064
## condition_textPatchy light rain -0.004895064
## condition_textPatchy light rain with thunder -0.025084118
## condition_textPatchy rain possible -0.041524517
## condition_textSunny 1.000000000
## condition_textThundery outbreaks possible -0.010954347
## condition_textTorrential rain shower -0.008481848
## condition_textThundery outbreaks possible
## (Intercept) NA
## air_quality_PM2.5 0.0016841789
## condition_textCloudy -0.0029359476
## condition_textFog -0.0043478980
## condition_textHeavy rain -0.0017679921
## condition_textHeavy rain at times -0.0012496654
## condition_textLight drizzle -0.0015308236
## condition_textLight rain -0.0072145080
## condition_textLight rain shower -0.0068053312
## condition_textMist -0.0069833442
## condition_textModerate or heavy rain shower -0.0026546126
## condition_textModerate or heavy rain with thunder -0.0050285402
## condition_textModerate rain -0.0035443999
## condition_textModerate rain at times -0.0015308236
## condition_textOvercast -0.0080798691
## condition_textPartly cloudy -0.0369772457
## condition_textPatchy light drizzle -0.0008834724
## condition_textPatchy light rain -0.0008834724
## condition_textPatchy light rain with thunder -0.0045272398
## condition_textPatchy rain possible -0.0074944412
## condition_textSunny -0.0109543475
## condition_textThundery outbreaks possible 1.0000000000
## condition_textTorrential rain shower -0.0015308236
## condition_textTorrential rain shower
## (Intercept) NA
## air_quality_PM2.5 -0.0110951004
## condition_textCloudy -0.0022732766
## condition_textFog -0.0033665364
## condition_textHeavy rain -0.0013689396
## condition_textHeavy rain at times -0.0009676041
## condition_textLight drizzle -0.0011853023
## condition_textLight rain -0.0055861255
## condition_textLight rain shower -0.0052693037
## condition_textMist -0.0054071375
## condition_textModerate or heavy rain shower -0.0020554415
## condition_textModerate or heavy rain with thunder -0.0038935512
## condition_textModerate rain -0.0027443954
## condition_textModerate rain at times -0.0011853023
## condition_textOvercast -0.0062561664
## condition_textPartly cloudy -0.0286311324
## condition_textPatchy light drizzle -0.0006840644
## condition_textPatchy light rain -0.0006840644
## condition_textPatchy light rain with thunder -0.0035053990
## condition_textPatchy rain possible -0.0058028751
## condition_textSunny -0.0084818479
## condition_textThundery outbreaks possible -0.0015308236
## condition_textTorrential rain shower 1.0000000000
# Highlight significant correlations
# Subsetting the matrix directly with abs(correlation) > 0.5 yields an NA for
# every NA cell and a 1 for every diagonal (self-correlation) cell.
# which() skips the NA cells; upper.tri() skips the diagonal and the mirrored
# copy of each pair, so every strongly correlated pair is listed exactly once.
strong_pairs <- which(abs(correlation) > 0.5 & upper.tri(correlation), arr.ind = TRUE)
significant_correlations <- data.frame(
  term_1 = rownames(correlation)[strong_pairs[, "row"]],
  term_2 = colnames(correlation)[strong_pairs[, "col"]],
  r = correlation[strong_pairs]
)
print("Significant correlations (|r| > 0.5):")
## [1] "Significant correlations (|r| > 0.5):"
print(significant_correlations)
library(ggplot2)
# Row of the correlation matrix that holds PM2.5 against every other term
air_quality_correlations <- correlation['air_quality_PM2.5', ]
# Leave out PM2.5's correlation with itself, then order the remaining values
# from most positive to most negative (signed order, not absolute value);
# na.last = NA discards missing coefficients
sorted_correlations <- sort(
  air_quality_correlations[names(air_quality_correlations) != 'air_quality_PM2.5'],
  decreasing = TRUE,
  na.last = NA
)
# First 10 entries of that ordering
top_10_correlations <- head(sorted_correlations, n = 10)
# Term name and coefficient side by side for ggplot
top_10_df <- data.frame(
  Condition = names(top_10_correlations),
  Correlation = top_10_correlations
)
# Bar chart of the selected coefficients, filled by sign
ggplot(top_10_df, aes(x = Condition, y = Correlation, fill = Correlation > 0)) +
  geom_bar(stat = "identity") +
  labs(x = 'Weather Conditions', y = 'Correlation with Air Quality (PM2.5)') +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Conclusion
- Weak Negative Correlation with Rain-Related Conditions:
This suggests that when rain-related weather conditions are present,
there tends to be a slight decrease in PM2.5 levels. Rain can help in
settling airborne particles, including pollutants, which might explain
this negative correlation. The weak magnitude of the correlation
indicates that while there is a relationship, it’s not very strong.
Other factors might also be influencing air quality.
- Positive Correlation with Mist, Sunny, and Cloudy Conditions:
A positive correlation with mist might indicate that in conditions
where mist is present, PM2.5 levels are higher. This could be due to
mist trapping pollutants close to the ground, preventing them from
dispersing.
Sunny conditions show a positive correlation, possibly because sunny
weather leads to increased human activities that contribute to air
pollution.
Cloudy weather being positively correlated could suggest that such
conditions are associated with increased pollutant levels. This might be
due to similar reasons as mist, where the dispersion of pollutants is
less efficient.
Hypothesis Testing
Based on the Exploratory Data Analysis, we want to test the hypotheses
below.
0. Effect of Extreme Winds on PM2.5
Null Hypothesis (H0): Extreme wind speed (25 kph and above, measured by
the wind_kph column) does not affect the levels of PM2.5.
Alternative Hypothesis (H1): Extreme wind speed (25 kph and above)
has a significant effect on the levels of PM2.5.
Two-Sample T test
To test this hypothesis, we can use a two-sample t-test. The idea is
to compare the mean levels of PM2.5 under two different conditions: when
wind speed is below 25 kph and when it is 25 kph or above. This test is
suitable for comparing the means of two independent groups.
# Categorizing wind speed
# The labels state kph because the threshold is applied to the wind_kph column.
weather_data$wind_speed_category <- ifelse(
  weather_data$wind_kph >= 25,
  '25 kph and above',
  'Below 25 kph'
)
# Extracting PM2.5 levels for each category
# (object names are kept as they are: t.test() echoes them in its "data:" line)
below_25_mph_pm25 <- subset(weather_data, wind_speed_category == 'Below 25 kph')$air_quality_PM2.5
above_25_mph_pm25 <- subset(weather_data, wind_speed_category == '25 kph and above')$air_quality_PM2.5
# Performing a two-sample t-test (with default arguments this is Welch's test)
t_test_results <- t.test(below_25_mph_pm25, above_25_mph_pm25)
# Viewing the results
print(t_test_results)
##
## Welch Two Sample t-test
##
## data: below_25_mph_pm25 and above_25_mph_pm25
## t = 8.8336, df = 601.92, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 10.36753 16.29531
## sample estimates:
## mean of x mean of y
## 20.882167 7.550746
t-value
The t-value is quite high, suggesting a significant difference in
PM2.5 levels between the two wind speed categories.
p-value
P-Value (< 2.2e-16): The extremely small value falls below common
alpha level of 0.05, which indicates strong evidence against the null
hypothesis. Therefore the observed difference in PM2.5 levels between
the two groups is highly unlikely to have occurred by chance.
Confidence Interval
95% Confidence Interval (10.36753 to 16.29531): We can be 95%
confident that the true difference in the mean PM2.5 levels between the
two wind speed categories lies within this range. Since the interval
does not include 0, it indicates a significant difference.
Visualization
# Load necessary library for plotting
library(ggplot2)
# Creating a boxplot
# coord_cartesian() zooms the y-axis without discarding observations, so the
# box statistics are computed from every row. Limits set on
# scale_y_continuous() remove out-of-range values before the statistics are
# computed, which distorts the boxes.
ggplot(weather_data, aes(x = wind_speed_category, y = air_quality_PM2.5, fill = wind_speed_category)) +
  geom_boxplot() +
  labs(title = "Comparison of PM2.5 Levels by Wind Speed Categories",
       x = "Wind Speed Category",
       y = "PM2.5 Levels") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set1") +
  coord_cartesian(ylim = c(0, 25))

The t-test estimates, together with the boxplot, indicate that the mean
PM2.5 level is significantly higher when the wind speed is below 25 kph
(20.882167) than when it is 25 kph or above (7.550746). Lower wind speeds
are therefore associated with higher concentrations of PM2.5, which could
be due to reduced dispersion of pollutants at lower wind speeds.
1. Relationship of Humidity with Severe and Non-Severe weather
conditions
Null Hypothesis (H0): There is no statistically significant
difference in humidity levels between severe and non-severe weather
conditions.
This hypothesis posits that any observed differences in humidity
levels are due to random chance and not because of the weather condition
being severe or non-severe.
Alternative Hypothesis (H1): There is a statistically significant
difference in humidity levels between severe and non-severe weather
conditions.
This hypothesis suggests that severe weather conditions are
associated with different humidity levels compared to non-severe
conditions, implying that the weather’s severity has a tangible impact
on humidity levels.
# Read the dataset from disk again into a fresh data frame
weather_data <- read.csv(file = "C:\\Users\\singh\\Documents\\StatsR\\dataset\\Final\\modified_weather_repo.csv")
Defining Severe and Non-Severe weather
conditions
We use criteria such as high wind speeds, high precipitation, low
visibility, and specific weather conditions to classify each observation
as severe or non-severe.
# Cut-offs for "severe": upper quartile of wind speed and precipitation,
# lower quartile of visibility
wind_speed_threshold <- quantile(weather_data$wind_kph, probs = 0.75)
precipitation_threshold <- quantile(weather_data$precip_mm, probs = 0.75)
visibility_threshold <- quantile(weather_data$visibility_km, probs = 0.25)
# Condition descriptions that count as severe on their own
severe_conditions <- c(
  'Patchy light rain with thunder',
  'Mist',
  'Moderate or heavy rain with thunder',
  'Fog',
  'Moderate or heavy rain shower',
  'Heavy rain',
  'Heavy rain at times'
)
# A row is severe when at least one of the four criteria holds
weather_data$is_severe <-
  weather_data$wind_kph > wind_speed_threshold |
  weather_data$precip_mm > precipitation_threshold |
  weather_data$visibility_km < visibility_threshold |
  weather_data$condition_text %in% severe_conditions
# Share of severe rows among the rows whose flag is not NA
percentage_severity <- sum(weather_data$is_severe, na.rm = TRUE) / sum(!is.na(weather_data$is_severe))
print(percentage_severity)
## [1] 0.4747435
Approximately 47% of the weather conditions in the dataset are
classified as “severe” based on the defined criteria. This
classification should provide a good basis for comparing humidity levels
between severe and non-severe weather conditions.
We’ll first check the distribution of humidity data to decide on the
appropriate statistical test (parametric or non-parametric).
Normality Test
The Shapiro-Wilk test is performed on humidity data for both severe
and non-severe weather to assess the normality of the data.
This step is essential to decide whether to use parametric or
non-parametric tests for further analysis.
# Performing Shapiro-Wilk Test for normality on humidity data
# Each group is the humidity column subset by the logical is_severe flag.
# The subsetting expression is passed inline because shapiro.test() echoes it
# as the "data:" label of the printed result.
# For severe weather conditions
shapiro_test_severe <- shapiro.test(weather_data$humidity[weather_data$is_severe])
# For non-severe weather conditions
shapiro_test_non_severe <- shapiro.test(weather_data$humidity[!weather_data$is_severe])
# Printing the results
shapiro_test_severe
##
## Shapiro-Wilk normality test
##
## data: weather_data$humidity[weather_data$is_severe]
## W = 0.90065, p-value < 2.2e-16
shapiro_test_non_severe
##
## Shapiro-Wilk normality test
##
## data: weather_data$humidity[!weather_data$is_severe]
## W = 0.93146, p-value < 2.2e-16
Based on the results we would proceed with the non-parametric
test.
Mann-Whitney U test
The Mann-Whitney U test is conducted to compare humidity levels
between severe and non-severe weather conditions.
# Humidity readings for the severe rows and for the remaining rows
humidity_severe <- with(weather_data, humidity[is_severe])
humidity_non_severe <- with(weather_data, humidity[!is_severe])
# Wilcoxon rank-sum (Mann-Whitney U) comparison of the two groups
test_result <- wilcox.test(x = humidity_severe, y = humidity_non_severe)
# Show the outcome
print(test_result)
##
## Wilcoxon rank sum test with continuity correction
##
## data: humidity_severe and humidity_non_severe
## W = 997298, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
A very low p-value indicates a significant difference, leading to the
rejection of the null hypothesis and acceptance of the alternative
hypothesis: severe weather conditions are associated with different
humidity levels compared to non-severe conditions.
Visualization of the hypothesis tests
Two types of plots are created using ggplot2 for visual
representation of the hypothesis test results:
- A boxplot showing the distribution of humidity levels in severe and
non-severe weather conditions.
- A histogram illustrating the overall distribution of humidity
levels, comparing severe and non-severe weather conditions.
# Boxplot of humidity, one box per value of the severity flag
ggplot(weather_data, aes(x = factor(is_severe), y = humidity)) +
  geom_boxplot(aes(fill = factor(is_severe))) +
  scale_fill_discrete(name = "Weather Condition", labels = c("Non-Severe", "Severe")) +
  labs(
    title = "Humidity Levels in Severe and Non-Severe Weather Conditions",
    x = "Weather Condition",
    y = "Humidity"
  )

# Overlaid histograms of humidity, one per value of the severity flag
ggplot(weather_data, aes(x = humidity)) +
  geom_histogram(aes(fill = factor(is_severe)), alpha = 0.6, position = 'identity', bins = 30) +
  scale_fill_discrete(name = "Weather Condition", labels = c("Non-Severe", "Severe")) +
  labs(
    title = "Distribution of Humidity Levels in Severe and Non-Severe Weather Conditions",
    x = "Humidity",
    y = "Count"
  )

2. Relationship of PM2.5 with Severe and Non-Severe weather
conditions
Null Hypothesis (H0): There is no statistically significant
difference in PM2.5 levels between severe and non-severe weather
conditions.
This hypothesis assumes that any observed differences in PM2.5 levels
across these two categories are due to random variation and not
inherently linked to the severity of the weather conditions.
Alternative Hypothesis (H1): There is a statistically significant
difference in PM2.5 levels between severe and non-severe weather
conditions.
This hypothesis suggests that the severity of weather conditions does
have an effect on PM2.5 levels, indicating a relationship between
weather condition severity and air quality as measured by PM2.5
concentrations.
# Quartile-based cut-offs: top 25% of wind speed and precipitation,
# bottom 25% of visibility
wind_speed_threshold <- quantile(weather_data$wind_kph, probs = 0.75)
precipitation_threshold <- quantile(weather_data$precip_mm, probs = 0.75)
visibility_threshold <- quantile(weather_data$visibility_km, probs = 0.25)
# Condition descriptions treated as severe regardless of the measurements
severe_conditions <- c(
  'Patchy light rain with thunder',
  'Mist',
  'Moderate or heavy rain with thunder',
  'Fog',
  'Moderate or heavy rain shower',
  'Heavy rain',
  'Heavy rain at times'
)
# Flag a row as severe when any one of the four criteria is met
weather_data$is_severe <-
  weather_data$wind_kph > wind_speed_threshold |
  weather_data$precip_mm > precipitation_threshold |
  weather_data$visibility_km < visibility_threshold |
  weather_data$condition_text %in% severe_conditions
Normality Test
# Performing Shapiro-Wilk Test for normality on PM2.5 data
# Each group is the air_quality_PM2.5 column subset by the logical is_severe
# flag. The subsetting expression is passed inline because shapiro.test()
# echoes it as the "data:" label of the printed result.
# For severe weather conditions
shapiro_test_severe <- shapiro.test(weather_data$air_quality_PM2.5[weather_data$is_severe])
# For non-severe weather conditions
shapiro_test_non_severe <- shapiro.test(weather_data$air_quality_PM2.5[!weather_data$is_severe])
# Printing the results
print(shapiro_test_severe)
##
## Shapiro-Wilk normality test
##
## data: weather_data$air_quality_PM2.5[weather_data$is_severe]
## W = 0.27794, p-value < 2.2e-16
print(shapiro_test_non_severe)
##
## Shapiro-Wilk normality test
##
## data: weather_data$air_quality_PM2.5[!weather_data$is_severe]
## W = 0.4241, p-value < 2.2e-16
Both p-values are significantly less than 0.05 (a common alpha level
for statistical tests). This indicates that the PM2.5 data for both
severe and non-severe weather conditions do not follow a normal
distribution.
Based on the results we would proceed with the non-parametric
test.
Mann-Whitney U test
We use the Mann-Whitney U test, a non-parametric test suitable for data
that is not normally distributed.
# PM2.5 readings for the severe rows and for the remaining rows
pm25_severe <- with(weather_data, air_quality_PM2.5[is_severe])
pm25_non_severe <- with(weather_data, air_quality_PM2.5[!is_severe])
# Wilcoxon rank-sum (Mann-Whitney U) comparison of the two groups
test_result <- wilcox.test(x = pm25_severe, y = pm25_non_severe)
# Show the outcome
print(test_result)
##
## Wilcoxon rank sum test with continuity correction
##
## data: pm25_severe and pm25_non_severe
## W = 657915, p-value = 8.62e-15
## alternative hypothesis: true location shift is not equal to 0
Based on these tests, we reject the null hypothesis (that there is no
difference in PM2.5 levels between severe and non-severe weather
conditions) in favor of the alternative hypothesis (that there is a
difference in PM2.5 levels between these two categories of weather
conditions). This suggests that the severity of the weather conditions
has a significant impact on PM2.5 air quality levels.
# Boxplot for PM2.5
# coord_cartesian() zooms the y-axis to 0-50 without discarding observations,
# so the box statistics use every row. Limits set on scale_y_continuous()
# remove out-of-range values before the statistics are computed.
ggplot(weather_data, aes(x = factor(is_severe), y = air_quality_PM2.5, fill = factor(is_severe))) +
  geom_boxplot() +
  labs(title = "PM2.5 Levels in Severe and Non-Severe Weather Conditions",
       x = "Weather Condition", y = "PM2.5") +
  scale_fill_discrete(name = "Weather Condition", labels = c("Non-Severe", "Severe")) +
  coord_cartesian(ylim = c(0, 50))

# Histogram for PM2.5
# alpha is an opacity and must lie in [0, 1]. With position = 'identity' the
# two groups are drawn on top of each other, so partial transparency is needed
# for both distributions to remain visible.
ggplot(weather_data, aes(x = air_quality_PM2.5, fill = factor(is_severe))) +
  geom_histogram(alpha = 0.6, position = 'identity', bins = 30) +
  labs(title = "Distribution of PM2.5 Levels in Severe and Non-Severe Weather Conditions",
       x = "PM2.5", y = "Count") +
  scale_fill_discrete(name = "Weather Condition", labels = c("Non-Severe", "Severe"))

Looking at the visualization, note that the statistically significant
test result does not necessarily imply a large or practically
significant difference.
3. Relationship of Air Quality variables with various explanatory
variables
1. Temperature
Null Hypothesis (H0): The mean levels of PM2.5 are the same across
all temperature categories (High, Medium, Low).
This hypothesis suggests that the temperature category does not
significantly influence PM2.5 levels.
Alternative Hypothesis (H1): There is a significant difference in
mean PM2.5 levels between at least one pair of temperature categories
(High, Medium, Low).
This implies that temperature category impacts PM2.5 levels.
2. Humidity
Null Hypothesis (H0): The mean levels of PM2.5 are the same across
all humidity categories (High, Medium, Low).
This hypothesis posits that the humidity category does not have a
significant effect on PM2.5 levels.
Alternative Hypothesis (H1): There is a significant difference in
mean PM2.5 levels between at least one pair of humidity categories
(High, Medium, Low).
This suggests that humidity category influences PM2.5 levels.
3. Wind Speed
Null Hypothesis (H0): The mean levels of PM2.5 are the same across
all wind speed categories.
This hypothesis indicates that the wind speed category does not
significantly impact PM2.5 levels.
Alternative Hypothesis (H1): There is a significant difference in
mean PM2.5 levels between at least one pair of wind speed
categories.
This implies that wind speed category affects PM2.5 levels.
Categorizing into Low, Medium, High
# Keep only the pollutant and meteorological columns used in this section
weather_data_subset <- weather_data[
  ,
  c('air_quality_PM2.5', 'air_quality_PM10', 'temperature_celsius', 'humidity', 'wind_kph'),
  drop = FALSE
]
# Categorize a numeric vector into 'Low', 'Medium', 'High'.
#
# data:  numeric vector. NA entries are ignored when the cut points are
#        computed and remain NA in the result.
# probs: two increasing probabilities giving the Low/Medium and Medium/High
#        cut points; the default reproduces the 33% / 66% split.
#
# Returns a factor as long as `data` with levels "Low", "Medium", "High".
# Intervals are right-closed, so a value equal to a cut point falls into the
# lower category. Stops with an error when the two cut points coincide (for
# example, heavily tied data), because three categories cannot be formed.
categorize <- function(data, probs = c(0.33, 0.66)) {
  stopifnot(is.numeric(data), length(probs) == 2, probs[1] < probs[2])
  cut_points <- quantile(data, probs = probs, na.rm = TRUE, names = FALSE)
  if (anyNA(cut_points) || cut_points[1] >= cut_points[2]) {
    stop("cannot form three categories: the quantile cut points are not distinct",
         call. = FALSE)
  }
  cut(data, breaks = c(-Inf, cut_points, Inf), labels = c("Low", "Medium", "High"))
}
# Derive a Low/Medium/High factor column for each meteorological variable
weather_data_subset[["temperature_category"]] <- categorize(weather_data_subset[["temperature_celsius"]])
weather_data_subset[["humidity_category"]] <- categorize(weather_data_subset[["humidity"]])
weather_data_subset[["wind_speed_category"]] <- categorize(weather_data_subset[["wind_kph"]])
ANOVA Test
# Load necessary library
library(stats)
# One-way ANOVA helper.
# df:              data frame holding both columns.
# dependent_var:   name of the response column, as a string.
# independent_var: name of the grouping column, as a string.
# Builds the formula `dependent_var ~ independent_var` from the two names and
# returns the fitted aov object (inspect it with summary()).
perform_anova <- function(df, dependent_var, independent_var) {
  aov(reformulate(independent_var, dependent_var), data = df)
}
# One-way ANOVA of PM2.5 against each categorized variable
anova_results_PM2.5_temperature <- perform_anova(
  df = weather_data_subset,
  dependent_var = 'air_quality_PM2.5',
  independent_var = 'temperature_category'
)
anova_results_PM2.5_humidity <- perform_anova(
  df = weather_data_subset,
  dependent_var = 'air_quality_PM2.5',
  independent_var = 'humidity_category'
)
anova_results_PM2.5_wind_speed <- perform_anova(
  df = weather_data_subset,
  dependent_var = 'air_quality_PM2.5',
  independent_var = 'wind_speed_category'
)
# The same three comparisons with PM10 as the response
anova_results_PM10_temperature <- perform_anova(
  df = weather_data_subset,
  dependent_var = 'air_quality_PM10',
  independent_var = 'temperature_category'
)
anova_results_PM10_humidity <- perform_anova(
  df = weather_data_subset,
  dependent_var = 'air_quality_PM10',
  independent_var = 'humidity_category'
)
anova_results_PM10_wind_speed <- perform_anova(
  df = weather_data_subset,
  dependent_var = 'air_quality_PM10',
  independent_var = 'wind_speed_category'
)
# summary() prints the ANOVA table of a fitted model, e.g.,
summary(anova_results_PM2.5_temperature)
## Df Sum Sq Mean Sq F value Pr(>F)
## temperature_category 2 41372 20686 7.286 0.000699 ***
## Residuals 2531 7185684 2839
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(anova_results_PM2.5_humidity)
## Df Sum Sq Mean Sq F value Pr(>F)
## humidity_category 2 23304 11652 4.094 0.0168 *
## Residuals 2531 7203751 2846
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(anova_results_PM2.5_wind_speed)
## Df Sum Sq Mean Sq F value Pr(>F)
## wind_speed_category 2 144528 72264 25.82 7.9e-12 ***
## Residuals 2531 7082528 2798
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Visualization
PM2.5 levels across Temperature categories
# Load necessary library for plotting
library(ggplot2)
# Boxplot for PM2.5 levels across Temperature categories
# coord_cartesian() zooms the y-axis to focus on the bulk of the data without
# discarding observations. Limits set on scale_y_continuous() remove
# out-of-range values before the box statistics are computed.
ggplot(weather_data_subset, aes(x = temperature_category, y = air_quality_PM2.5, fill = temperature_category)) +
  geom_boxplot() +
  labs(title = "PM2.5 Levels Across Temperature Categories", x = "Temperature Category", y = "PM2.5 Levels") +
  theme_minimal() +
  coord_cartesian(ylim = c(0, 40))

# PM2.5 against temperature, points coloured by temperature category
ggplot(
  weather_data_subset,
  aes(x = temperature_celsius, y = air_quality_PM2.5, color = temperature_category)
) +
  geom_point() +
  theme_minimal() +
  labs(title = "PM2.5 Levels vs. Temperature", x = "Temperature (Celsius)", y = "PM2.5 Levels")

The very low p-value 0.000699 indicates that there are statistically
significant differences between groups within the temperature category
in terms of their impact on the PM2.5 levels. The F value suggests a
moderate degree of variation between groups.
This result leads to the rejection of the null hypothesis (H0) for
the temperature category.
PM2.5 levels across Humidity categories
# Boxplot for PM2.5 levels across Humidity categories
# coord_cartesian() zooms the y-axis without discarding observations. Limits
# set on scale_y_continuous() remove out-of-range values before the box
# statistics are computed.
ggplot(weather_data_subset, aes(x = humidity_category, y = air_quality_PM2.5, fill = humidity_category)) +
  geom_boxplot() +
  labs(title = "PM2.5 Levels Across Humidity Categories", x = "Humidity Category", y = "PM2.5 Levels") +
  theme_minimal() +
  coord_cartesian(ylim = c(0, 40))

# PM2.5 against humidity, points coloured by humidity category
ggplot(
  weather_data_subset,
  aes(x = humidity, y = air_quality_PM2.5, color = humidity_category)
) +
  geom_point() +
  theme_minimal() +
  labs(title = "PM2.5 Levels vs. Humidity", x = "Humidity", y = "PM2.5 Levels")

This result shows that there are statistically significant
differences between groups within the humidity category, but the level
of significance is less than that of the temperature category (as
indicated by a higher p-value). The F value is lower, indicating less
variation between groups compared to the temperature category.
Similar to the temperature category, the null hypothesis (H0) for the
humidity category is rejected implying humidity influences PM2.5
levels.
PM2.5 levels across Wind Speed categories
# Boxplot for PM2.5 levels across Wind Speed categories
# coord_cartesian() zooms the y-axis without discarding observations. Limits
# set on scale_y_continuous() remove out-of-range values before the box
# statistics are computed.
ggplot(weather_data_subset, aes(x = wind_speed_category, y = air_quality_PM2.5, fill = wind_speed_category)) +
  geom_boxplot() +
  labs(title = "PM2.5 Levels Across Wind Speed Categories", x = "Wind Speed Category", y = "PM2.5 Levels") +
  theme_minimal() +
  coord_cartesian(ylim = c(0, 40))

# PM2.5 against wind speed, points coloured by wind speed category
ggplot(
  weather_data_subset,
  aes(x = wind_kph, y = air_quality_PM2.5, color = wind_speed_category)
) +
  geom_point() +
  theme_minimal() +
  labs(title = "PM2.5 Levels vs. Wind Speed", x = "Wind Speed", y = "PM2.5 Levels")

This result indicates a very strong statistical significance, with a
much lower p-value compared to the other two categories. The high F
value suggests a substantial degree of variation between groups within
the wind speed category.
The null hypothesis (H0) for the wind speed category is strongly
rejected.
This outcome indicates a very significant difference in PM2.5 levels
across different wind speed categories, showing that wind speed has a
substantial impact on PM2.5 levels.
Conclusion
The strength of these effects varies, with wind speed showing the
most substantial effect (highest F value and lowest p-value), followed
by temperature and then humidity.
Air Quality Variation Across Continents
Null Hypothesis (H0): There is no significant difference in the mean
levels of specific air pollutants (like PM2.5, PM10) across
continents.
Alternative Hypothesis (H1): There is a significant difference in the
mean levels of specific air pollutants across continents.
# Keep only the columns needed for the continent-level hypothesis test
hypothesis_columns <- c("air_quality_PM2.5", "air_quality_PM10", "continent")
data_for_hypothesis <- weather_data %>%
  select(all_of(hypothesis_columns))

# One-way ANOVA of PM2.5 across continents, using the perform_anova() helper
# defined earlier in the file. PM10 is selected above but not tested here.
anova_results_PM2.5_continent <- perform_anova(
  data_for_hypothesis,
  "air_quality_PM2.5",
  "continent"
)

# Print the ANOVA table
summary(anova_results_PM2.5_continent)
## Df Sum Sq Mean Sq F value Pr(>F)
## continent 7 444412 63487 23.64 <2e-16 ***
## Residuals 2526 6782644 2685
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Boxplot for PM2.5 levels across continents
# The y-axis is zoomed with coord_cartesian() instead of scale limits:
# scale_y_continuous(limits = ...) discards observations outside 0-40 before
# the box statistics are computed (the source of the former "Removed 243 rows"
# warning), which biases the medians and quartiles. coord_cartesian() only
# changes the visible window and keeps every row in the calculation.
ggplot(
  data_for_hypothesis,
  aes(x = continent, y = air_quality_PM2.5, fill = continent)
) +
  geom_boxplot() +
  labs(
    title = "PM2.5 Levels Across Continents",
    x = "Continent",
    y = "PM2.5 Levels"
  ) +
  theme_minimal() +
  coord_cartesian(ylim = c(0, 40))

The p-value (< 2e-16) is far below 0.05, so the test result is
statistically significant.
The F-value is also high (23.64 for PM2.5), which
strengthens the evidence against the null hypothesis.
Given these results, we reject the Null Hypothesis (H0). This means
there is sufficient evidence to support the Alternative Hypothesis (H1)
that there are significant differences in the mean levels of the air
pollutant across different continents.
4. Air Quality (CO, O3, NO2, SO2) variation across Continents
For each air pollutant (Carbon Monoxide, Ozone, Nitrogen Dioxide,
Sulphur Dioxide):
Null Hypothesis (H0): There is no significant difference in the mean
levels of the specific air pollutant across continents.
Alternative Hypothesis (H1): There is a significant difference in the
mean levels of the specific air pollutant across continents.
# Data preparation: keep the four gas pollutant columns plus the continent
pollutant_columns <- c(
  "air_quality_Carbon_Monoxide",
  "air_quality_Ozone",
  "air_quality_Nitrogen_dioxide",
  "air_quality_Sulphur_dioxide"
)
data_for_analysis <- weather_data %>%
  select(all_of(c(pollutant_columns, "continent")))

# One-way ANOVA per pollutant across continents, using the perform_anova()
# helper defined earlier in the file
anova_results_CO <- perform_anova(
  data_for_analysis, "air_quality_Carbon_Monoxide", "continent"
)
anova_results_Ozone <- perform_anova(
  data_for_analysis, "air_quality_Ozone", "continent"
)
anova_results_NO2 <- perform_anova(
  data_for_analysis, "air_quality_Nitrogen_dioxide", "continent"
)
anova_results_SO2 <- perform_anova(
  data_for_analysis, "air_quality_Sulphur_dioxide", "continent"
)

# ANOVA table for Carbon Monoxide
summary(anova_results_CO)
## Df Sum Sq Mean Sq F value Pr(>F)
## continent 7 9.584e+07 13691627 15.92 <2e-16 ***
## Residuals 2526 2.172e+09 860052
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA table for Ozone
summary(anova_results_Ozone)
## Df Sum Sq Mean Sq F value Pr(>F)
## continent 7 24832 3547 3.266 0.00186 **
## Residuals 2526 2743957 1086
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA table for Nitrogen Dioxide
summary(anova_results_NO2)
## Df Sum Sq Mean Sq F value Pr(>F)
## continent 7 96753 13822 42.86 <2e-16 ***
## Residuals 2526 814618 322
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA table for Sulphur Dioxide
summary(anova_results_SO2)
## Df Sum Sq Mean Sq F value Pr(>F)
## continent 7 59789 8541 48.1 <2e-16 ***
## Residuals 2526 448511 178
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Visualization: Boxplot for each pollutant across continents
# Carbon Monoxide. The y-axis is zoomed with coord_cartesian() instead of
# scale limits: scale_y_continuous(limits = ...) discards observations outside
# 0-1000 before the box statistics are computed (the source of the former
# "Removed 166 rows" warning), which biases the medians and quartiles.
ggplot(
  data_for_analysis,
  aes(x = continent, y = air_quality_Carbon_Monoxide, fill = continent)
) +
  geom_boxplot() +
  labs(
    title = "Carbon Monoxide Levels Across Continents",
    x = "Continent",
    y = "Carbon Monoxide Levels"
  ) +
  theme_minimal() +
  coord_cartesian(ylim = c(0, 1000))

# Ozone boxplot across continents. The y-axis is zoomed with
# coord_cartesian() instead of scale limits: scale_y_continuous(limits = ...)
# discards observations outside 0-250 before the box statistics are computed
# (the source of the former "Removed 4 rows" warning).
ggplot(
  data_for_analysis,
  aes(x = continent, y = air_quality_Ozone, fill = continent)
) +
  geom_boxplot() +
  labs(
    title = "Ozone Levels Across Continents",
    x = "Continent",
    y = "Ozone"
  ) +
  theme_minimal() +
  coord_cartesian(ylim = c(0, 250))

# Nitrogen Dioxide boxplot across continents. The y-axis is zoomed with
# coord_cartesian() instead of scale limits: scale_y_continuous(limits = ...)
# discards observations outside 0-75 before the box statistics are computed
# (the source of the former "Removed 49 rows" warning).
# Labels name the plotted column (nitrogen dioxide), not elemental nitrogen.
ggplot(
  data_for_analysis,
  aes(x = continent, y = air_quality_Nitrogen_dioxide, fill = continent)
) +
  geom_boxplot() +
  labs(
    title = "Nitrogen Dioxide Levels Across Continents",
    x = "Continent",
    y = "Nitrogen Dioxide"
  ) +
  theme_minimal() +
  coord_cartesian(ylim = c(0, 75))

# Sulphur Dioxide boxplot across continents. The y-axis is zoomed with
# coord_cartesian() instead of scale limits: scale_y_continuous(limits = ...)
# discards observations outside 0-30 before the box statistics are computed
# (the source of the former "Removed 136 rows" warning).
# Labels name the plotted column (sulphur dioxide), not elemental sulphur.
ggplot(
  data_for_analysis,
  aes(x = continent, y = air_quality_Sulphur_dioxide, fill = continent)
) +
  geom_boxplot() +
  labs(
    title = "Sulphur Dioxide Levels Across Continents",
    x = "Continent",
    y = "Sulphur Dioxide"
  ) +
  theme_minimal() +
  coord_cartesian(ylim = c(0, 30))

Conclusion
Carbon Monoxide: The extremely low p-value (< 2e-16) suggests that
we reject the null hypothesis. There is a significant difference in the
mean levels of Carbon Monoxide across continents.
Ozone: The p-value is less than 0.05, indicating that there are
significant differences in the mean levels of Ozone across continents.
We reject the null hypothesis.
Nitrogen Dioxide: With a very low p-value, the results are
statistically significant, leading to the rejection of the null
hypothesis. There is a significant variation in the mean levels of
Nitrogen Dioxide across continents.
Sulphur Dioxide: The p-value is significantly low, indicating strong
evidence against the null hypothesis. We can conclude that the mean
levels of Sulphur Dioxide significantly differ across continents.
For all four air pollutants (Carbon Monoxide, Ozone, Nitrogen
Dioxide, Sulphur Dioxide), the ANOVA tests reveal significant
differences in mean pollutant levels across continents.
This implies that geographical location, as defined by continents, is
a determining factor in the variation of these air pollutant levels.
5. Identifying the countries affected by pollution levels in
continents
1. By Nitrogen levels in America, Asia, and Europe
# Load necessary libraries
library(dplyr)
library(ggplot2)

# Filter the dataset for relevant continents (America, Asia, Europe)
continents_of_interest <- c("America", "Asia", "Europe")
df_filtered <- weather_data %>%
  filter(continent %in% continents_of_interest)

# Average Nitrogen Dioxide level per country, sorted highest first.
# `.groups = "drop"` returns an ungrouped result, so summarize() no longer
# emits its "grouped output" message and the next step groups explicitly.
df_aggregated <- df_filtered %>%
  group_by(continent, country) %>%
  summarize(
    average_nitrogen = mean(air_quality_Nitrogen_dioxide, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(average_nitrogen))

# Top 5 countries within each continent by average Nitrogen Dioxide level.
# slice_max() replaces the superseded top_n(); ties are kept, as before.
top_countries <- df_aggregated %>%
  group_by(continent) %>%
  slice_max(average_nitrogen, n = 5) %>%
  ungroup()

# Bar chart, one colour per continent. The title states that the ranking is
# per continent, because the chart shows five countries for each of them.
nitrogen_plot <- ggplot(
  top_countries,
  aes(
    x = reorder(country, desc(average_nitrogen)),
    y = average_nitrogen,
    fill = continent
  )
) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Top 5 Countries per Continent by Average Nitrogen Dioxide Levels in America, Asia, and Europe",
    x = "Country",
    y = "Average Nitrogen Dioxide Level"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_manual(
    values = c("America" = "blue", "Asia" = "red", "Europe" = "green")
  )

# Display the plot
print(nitrogen_plot)

# Save the plot; passing it explicitly avoids depending on last_plot()
ggsave("nitrogen_levels_plot.png", plot = nitrogen_plot, width = 12, height = 8)
Top 5 Countries for Nitrogen Dioxide in America, Asia, and
Europe:
Iran
Ecuador
Malaysia
Qatar
Chile
2. By Carbon Monoxide levels in Africa, America, and Asia
# Load necessary libraries
library(dplyr)
library(ggplot2)

# Filter the dataset for relevant continents (Africa, America, Asia)
continents_of_interest_co <- c("Africa", "America", "Asia")
df_filtered_co <- weather_data %>%
  filter(continent %in% continents_of_interest_co)

# Average Carbon Monoxide level per country, sorted highest first.
# `.groups = "drop"` returns an ungrouped result, so summarize() no longer
# emits its "grouped output" message and the next step groups explicitly.
df_aggregated_co <- df_filtered_co %>%
  group_by(continent, country) %>%
  summarize(
    average_co = mean(air_quality_Carbon_Monoxide, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(average_co))

# Top 5 countries within each continent by average Carbon Monoxide level.
# slice_max() replaces the superseded top_n(); ties are kept, as before.
top_countries_co <- df_aggregated_co %>%
  group_by(continent) %>%
  slice_max(average_co, n = 5) %>%
  ungroup()

# Bar chart, one colour per continent. The title states that the ranking is
# per continent, because the chart shows five countries for each of them.
carbon_monoxide_plot <- ggplot(
  top_countries_co,
  aes(
    x = reorder(country, desc(average_co)),
    y = average_co,
    fill = continent
  )
) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Top 5 Countries per Continent by Average Carbon Monoxide Levels in Africa, America, and Asia",
    x = "Country",
    y = "Average Carbon Monoxide Level"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_manual(
    values = c("Africa" = "orange", "America" = "blue", "Asia" = "red")
  )

# Display the plot
print(carbon_monoxide_plot)

# Save the plot; passing it explicitly avoids depending on last_plot()
ggsave(
  "carbon_monoxide_levels_plot.png",
  plot = carbon_monoxide_plot,
  width = 12,
  height = 8
)
Top 5 Countries for Carbon Monoxide in Africa, America, and
Asia:
Malaysia
Chile
Indonesia
Ecuador
Ethiopia
3. By Ozone levels in America, Asia, Europe, and Pacific
# Filter the dataset for relevant continents (America, Asia, Europe, Pacific)
continents_of_interest_ozone <- c("America", "Asia", "Europe", "Pacific")
df_filtered_ozone <- weather_data %>%
  filter(continent %in% continents_of_interest_ozone)

# Average Ozone level per country, sorted highest first.
# `.groups = "drop"` returns an ungrouped result, so summarize() no longer
# emits its "grouped output" message and the next step groups explicitly.
df_aggregated_ozone <- df_filtered_ozone %>%
  group_by(continent, country) %>%
  summarize(
    average_ozone = mean(air_quality_Ozone, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(average_ozone))

# Top 5 countries within each continent by average Ozone level.
# slice_max() replaces the superseded top_n(); ties are kept, as before.
top_countries_ozone <- df_aggregated_ozone %>%
  group_by(continent) %>%
  slice_max(average_ozone, n = 5) %>%
  ungroup()

# Bar chart, one colour per continent. The title states that the ranking is
# per continent, because the chart shows five countries for each of them.
ozone_plot <- ggplot(
  top_countries_ozone,
  aes(
    x = reorder(country, desc(average_ozone)),
    y = average_ozone,
    fill = continent
  )
) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Top 5 Countries per Continent by Average Ozone Levels in America, Asia, Europe, and the Pacific",
    x = "Country",
    y = "Average Ozone Level"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_manual(
    values = c(
      "America" = "blue",
      "Asia" = "red",
      "Europe" = "green",
      "Pacific" = "purple"
    )
  )

# Display the plot
print(ozone_plot)

# Save the plot; passing it explicitly avoids depending on last_plot()
ggsave("ozone_levels_plot.png", plot = ozone_plot, width = 12, height = 8)
Top 5 Countries for Ozone in America, Asia, Europe, and
Pacific:
Bahrain
United Arab Emirates
Mexico
Kuwait
Haiti
4. By Sulphur levels in America, Asia
# Load necessary libraries
library(dplyr)
library(ggplot2)

# Filter the dataset for relevant continents (America and Asia)
continents_of_interest_sulphur <- c("America", "Asia")
df_filtered_sulphur <- weather_data %>%
  filter(continent %in% continents_of_interest_sulphur)

# Average Sulphur Dioxide level per country, sorted highest first.
# `.groups = "drop"` returns an ungrouped result, so summarize() no longer
# emits its "grouped output" message and the next step groups explicitly.
df_aggregated_sulphur <- df_filtered_sulphur %>%
  group_by(continent, country) %>%
  summarize(
    average_sulphur = mean(air_quality_Sulphur_dioxide, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(average_sulphur))

# Top 5 countries within each continent by average Sulphur Dioxide level.
# slice_max() replaces the superseded top_n(); ties are kept, as before.
top_countries_sulphur <- df_aggregated_sulphur %>%
  group_by(continent) %>%
  slice_max(average_sulphur, n = 5) %>%
  ungroup()

# Bar chart, one colour per continent. The title states that the ranking is
# per continent, because the chart shows five countries for each of them.
sulphur_plot <- ggplot(
  top_countries_sulphur,
  aes(
    x = reorder(country, desc(average_sulphur)),
    y = average_sulphur,
    fill = continent
  )
) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Top 5 Countries per Continent by Average Sulphur Dioxide Levels in America and Asia",
    x = "Country",
    y = "Average Sulphur Dioxide Level"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_manual(values = c("America" = "blue", "Asia" = "red"))

# Display the plot
print(sulphur_plot)

# Save the plot; passing it explicitly avoids depending on last_plot()
ggsave("sulphur_levels_plot.png", plot = sulphur_plot, width = 12, height = 8)
Top 5 Countries for Sulphur Dioxide in America and
Asia:
Iran
China
Malaysia
Qatar
Venezuela