getwd()
## [1] "C:/Users/maria/OneDrive/Documents"
#Install the packages
#install.packages("dplyr")
#install.packages("ggplot2")
#install.packages("viridisLite")
#install.packages("randomForest")
#install.packages("caret")
#install.packages("e1071")
#install.packages("GGally")
#install.packages("shiny")
#Load the libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(viridisLite)
## Warning: package 'viridisLite' was built under R version 4.3.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
library(GGally)
## Warning: package 'GGally' was built under R version 4.3.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
#Load the data
data <- read.csv("Final_Project.csv")
#Display the data
head(data)
#Let's take a look at the data
str(data)
## 'data.frame': 32190 obs. of 15 variables:
## $ WHO.Region : chr "Eastern Mediterranean Region" "European Region" "European Region" "European Region" ...
## $ ISO3 : chr "AFG" "ALB" "ALB" "ALB" ...
## $ WHO.Country.Name : chr "Afghanistan" "Albania" "Albania" "Albania" ...
## $ City.or.Locality : chr "Kabul" "Durres" "Durres" "Elbasan" ...
## $ Measurement.Year : int 2019 2015 2016 2015 2016 2017 2015 2016 2014 2015 ...
## $ PM2.5...g.m3. : num 119.8 NA 14.3 NA NA ...
## $ PM10...g.m3. : num NA 17.6 24.6 NA NA ...
## $ NO2...g.m3. : num NA 26.6 24.8 24 26.3 ...
## $ PM25.temporal.coverage.... : num 18 NA NA NA NA NA NA NA NA NA ...
## $ PM10.temporal.coverage.... : num NA NA NA NA NA NA NA NA NA NA ...
## $ NO2.temporal.coverage.... : num NA 84 87.9 97.9 96 ...
## $ Reference : chr "U.S. Department of State, United States Environmental Protection Agency" "European Environment Agency (downloaded in 2021)" "European Environment Agency (downloaded in 2021)" "European Environment Agency (downloaded in 2021)" ...
## $ Number.and.type.of.monitoring.stations: chr NA NA NA NA ...
## $ Version.of.the.database : int 2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
## $ Status : logi NA NA NA NA NA NA ...
1. Clean the data
#First let's look at the column names
names(data)
## [1] "WHO.Region"
## [2] "ISO3"
## [3] "WHO.Country.Name"
## [4] "City.or.Locality"
## [5] "Measurement.Year"
## [6] "PM2.5...g.m3."
## [7] "PM10...g.m3."
## [8] "NO2...g.m3."
## [9] "PM25.temporal.coverage...."
## [10] "PM10.temporal.coverage...."
## [11] "NO2.temporal.coverage...."
## [12] "Reference"
## [13] "Number.and.type.of.monitoring.stations"
## [14] "Version.of.the.database"
## [15] "Status"
#Select only the columns that will be used in the analysis
data <- select(data, "ISO3", "WHO.Country.Name", "City.or.Locality", "Measurement.Year", "PM2.5...g.m3.", "PM10...g.m3.")
#Display the new dataset
head(data)
#Change the column names
names(data)[1:6] <- c("Code", "Country", "City_Locality", "Year", "PM_2.5", "PM_10")
#Display the data
head(data)
Since the analysis focuses on PM 2.5 AND PM 10 we would need to drop those rows without these values
#Drop the rows with an NA value for PM 2.5 or PM 10
data <- data[complete.cases(data$PM_2.5, data$PM_10), ]
#Display the dataset
head(data)
#Let's see the new dataset after dropping the NA values
str(data)
## 'data.frame': 8824 obs. of 6 variables:
## $ Code : chr "ALB" "ALB" "ALB" "ALB" ...
## $ Country : chr "Albania" "Albania" "Albania" "Albania" ...
## $ City_Locality: chr "Durres" "Korce" "Korce" "Vrith" ...
## $ Year : int 2016 2015 2016 2015 2017 2018 2019 2018 2019 2015 ...
## $ PM_2.5 : num 14.3 30.3 28.6 13.2 32.8 ...
## $ PM_10 : num 24.6 45.3 40.2 19.5 122.2 ...
#View a summary of the dataset
summary(data)
## Code Country City_Locality Year
## Length:8824 Length:8824 Length:8824 Min. :2010
## Class :character Class :character Class :character 1st Qu.:2014
## Mode :character Mode :character Mode :character Median :2016
## Mean :2016
## 3rd Qu.:2018
## Max. :2021
## PM_2.5 PM_10
## Min. : 0.010 Min. : 2.17
## 1st Qu.: 9.818 1st Qu.: 17.00
## Median : 13.360 Median : 21.85
## Mean : 16.479 Mean : 29.34
## 3rd Qu.: 19.310 3rd Qu.: 30.48
## Max. :191.900 Max. :540.00
2. Analyze the data
#Look for the countries with the highest values for PM 2.5 recorded
top_5_pm_2.5 <- head(data[order(-data$PM_2.5), ], 5)
#Print the rows
top_5_pm_2.5
#Look for the countries with the highest values for PM 10 recorded
top_5_pm_10 <- head(data[order(-data$PM_10), ], 5)
#Print the row
top_5_pm_10
#Now let's see the countries with the lowest levels of PM 10 recorded
min_pm_2.5 <- head(data[order(data$PM_2.5), ], 5)
#Print the row
min_pm_2.5
#Now let's see the countries with the lowest levels of PM 10 recorded
min_pm_10 <- head(data[order(data$PM_10), ], 5)
#Print the row
min_pm_10
3. Create visualizations: Histograms
#Create a histogram for the PM_2.5
ggplot(data, aes(x= PM_2.5)) +
geom_histogram(binwidth = 15, fill = "blue", color = "white") +
labs(title = "World Wide Levels of PM 2.5", x = "PM 2.5 Levels", y = "Intensity")
#Create a histogram for the PM_10
ggplot(data, aes(x= PM_10)) +
geom_histogram(binwidth = 15, fill = "black", color = "white") +
labs(title = "World Wide Levels of PM 10", x = "PM 10 Levels", y = "Intensity")
4. Create a subset to focus on the data for USA only
#Create a subset that will contain only the information from USA
usa <- subset(data, Code == "USA")
usa
#Look for the cities with the highest values of PM 2.5 recorded
top_usa_pm_2.5 <- head(usa[order(-usa$PM_2.5), ], 5)
#Print the rows
top_usa_pm_2.5
#Look for the cities with the lowest values of PM 2.5 recorded
w_usa_pm_2.5 <- head(usa[order(usa$PM_2.5), ], 5)
#Print the rows
w_usa_pm_2.5
#Look for the cities with the highest values for PM 10 recorded
top_usa_pm_10 <- head(usa[order(-usa$PM_10), ], 5)
#Print the rows
top_usa_pm_10
#Look for the cities with the lowest values for PM 10 recorded
w_usa_pm_10 <- head(usa[order(usa$PM_10), ], 5)
#Print the row
w_usa_pm_10
5. Create visualizations: Box Plots
#Create a box plot to display the levels of PM 2.5 in the US
boxplot_2.5 <- ggplot(usa, aes(x = factor(Year), y = PM_2.5, fill = factor(Year))) +
geom_boxplot() + # Create box plots
scale_fill_viridis_d(option = "C", end = 0.9) + # Use Viridis color palette
labs(title = "PM 2.5 Levels In the US", x = "Years", y = "Intensity") +
theme_minimal() + # Use a minimal theme
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels
boxplot_2.5 <- boxplot_2.5 +
labs(fill = "Years")
#Display the graph
print(boxplot_2.5)
#Create a box plot to display the levels PM 10 in the US
boxplot_10 <- ggplot(usa, aes(x = factor(Year), y = PM_10, fill = factor(Year))) +
geom_boxplot() + # Create box plots
scale_fill_viridis_d(option = "D", end = 0.9) + # Use Viridis color palette
labs(title = "PM 10 Levels In the US", x = "Years", y = "Intensity") +
theme_minimal() + # Use a minimal theme
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels
boxplot_10 <- boxplot_10 +
labs(fill = "Years")
#Display the graph
print(boxplot_10)
6. Find a correlation between PM 2.5 and PM 10
#Find a correlation between PM 2.5 and PM 10
correlation <- cor(usa$PM_2.5, usa$PM_10)
print(correlation)
## [1] 0.5241959
Strength of the Relationship: A correlation coefficient of 0.5 suggests a moderate degree of linear relationship between the two variables. As one variable increases, the other tends to increase as well, but the relationship is not perfect.
Direction of the Relationship: Since the correlation coefficient is positive, it indicates a positive association between the variables. This means that higher values of one variable tend to be associated with higher values of the other variable.
#Create a graph to represent the correlation between the variables
pollution_pairs <- data.frame(usa$PM_2.5, usa$PM_10)
pollutants <- c("PM 2.5", "PM 10")
ggpairs(pollution_pairs, lower.panel = NULL)
## Warning in warn_if_args_exist(list(...)): Extra arguments: "lower.panel" are
## being ignored. If these are meant to be aesthetics, submit them using the
## 'mapping' variable within ggpairs with ggplot2::aes or ggplot2::aes_string.
7. Create a Machine Learning Model: Linear regression
#We will use linear regression to try to predict PM 2.5 based on the levels of PM 10
lm_2.5 <- lm(PM_2.5 ~ PM_10, data = usa)
summary(lm_2.5)
##
## Call:
## lm(formula = PM_2.5 ~ PM_10, data = usa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.0935 -2.4897 0.1401 2.3045 11.6003
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.43561 0.43761 10.14 <2e-16 ***
## PM_10 0.20578 0.01797 11.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.139 on 346 degrees of freedom
## Multiple R-squared: 0.2748, Adjusted R-squared: 0.2727
## F-statistic: 131.1 on 1 and 346 DF, p-value: < 2.2e-16
Residual Standard Error (RSE = 3.139): A lower RSE indicates that the model’s predictions are closer to the observed values on average. However, what constitutes a “good” RSE depends on the scale of the dependent variable. Because the units that we are trying to analyze are in a scale from 1-12, the RSE would be considered to “high” for it to predict the values of PM 2.5 based on PM 10
Multiple R-squared and Adjusted R-squared (0.2748): These metrics measure the proportion of variance in the dependent variable that is explained by the independent variables. A higher R-squared value indicates that a larger proportion of the variability in the dependent variable (PM 2.5). In our case a variability of 0.2748 would be considered acceptable for the units that we use in our scale
F-statistic and p-value: The F-statistic tests the overall significance of the regression model, while the associated p-value indicates the probability of observing the data if the null hypothesis (that all regression coefficients are zero) were true. A low p-value (typically less than 0.05) suggests that the model is statistically significant. In this case, the F-statistic is high, and the p-value is very low, indicating that the model is statistically significant.
#Create predictions
#Let's test the model
new_data <- read.csv("PM25National.csv")
preds_2.5 <- predict(lm_2.5, new_data)
preds_2.5 <- data.frame(preds_2.5) # To convert to a data frame
#Display the predictions
head(preds_2.5)
#To compare the predictions to some actual values
comparison <- data.frame(c(data.frame(head(pollution_pairs$usa.PM_2.5)), head(preds_2.5)))
#Create an extrac column to show the difference between the predictions and the actual values
comparison$Difference <- comparison$preds_2.5 - comparison$head.pollution_pairs.usa.PM_2.5.
#Change the column names
names(comparison) <- c("Actual Values PM 2.5", "Predictions PM 2.5", "Difference")
#View the data
comparison