getwd()
## [1] "C:/Users/maria/OneDrive/Documents"
#Install the packages
#install.packages("dplyr")
#install.packages("ggplot2")
#install.packages("viridisLite")
#install.packages("randomForest")
#install.packages("caret")
#install.packages("e1071")
#install.packages("GGally")
#install.packages("shiny")
#Load the libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(viridisLite)
## Warning: package 'viridisLite' was built under R version 4.3.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
library(GGally)
## Warning: package 'GGally' was built under R version 4.3.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
#Load the data
data <- read.csv("Final_Project.csv")

#Display the data
head(data)
#Let's take a look at the data
str(data)
## 'data.frame':    32190 obs. of  15 variables:
##  $ WHO.Region                            : chr  "Eastern Mediterranean Region" "European Region" "European Region" "European Region" ...
##  $ ISO3                                  : chr  "AFG" "ALB" "ALB" "ALB" ...
##  $ WHO.Country.Name                      : chr  "Afghanistan" "Albania" "Albania" "Albania" ...
##  $ City.or.Locality                      : chr  "Kabul" "Durres" "Durres" "Elbasan" ...
##  $ Measurement.Year                      : int  2019 2015 2016 2015 2016 2017 2015 2016 2014 2015 ...
##  $ PM2.5...g.m3.                         : num  119.8 NA 14.3 NA NA ...
##  $ PM10...g.m3.                          : num  NA 17.6 24.6 NA NA ...
##  $ NO2...g.m3.                           : num  NA 26.6 24.8 24 26.3 ...
##  $ PM25.temporal.coverage....            : num  18 NA NA NA NA NA NA NA NA NA ...
##  $ PM10.temporal.coverage....            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NO2.temporal.coverage....             : num  NA 84 87.9 97.9 96 ...
##  $ Reference                             : chr  "U.S. Department of State, United States Environmental Protection Agency" "European Environment Agency (downloaded in 2021)" "European Environment Agency (downloaded in 2021)" "European Environment Agency (downloaded in 2021)" ...
##  $ Number.and.type.of.monitoring.stations: chr  NA NA NA NA ...
##  $ Version.of.the.database               : int  2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
##  $ Status                                : logi  NA NA NA NA NA NA ...

1. Clean the data

#First let's look at the column names
names(data)
##  [1] "WHO.Region"                            
##  [2] "ISO3"                                  
##  [3] "WHO.Country.Name"                      
##  [4] "City.or.Locality"                      
##  [5] "Measurement.Year"                      
##  [6] "PM2.5...g.m3."                         
##  [7] "PM10...g.m3."                          
##  [8] "NO2...g.m3."                           
##  [9] "PM25.temporal.coverage...."            
## [10] "PM10.temporal.coverage...."            
## [11] "NO2.temporal.coverage...."             
## [12] "Reference"                             
## [13] "Number.and.type.of.monitoring.stations"
## [14] "Version.of.the.database"               
## [15] "Status"
#Select only the columns that will be used in the analysis
data <- select(data, "ISO3", "WHO.Country.Name", "City.or.Locality", "Measurement.Year", "PM2.5...g.m3.", "PM10...g.m3.")

#Display the new dataset
head(data)
#Change the column names
names(data)[1:6] <- c("Code", "Country", "City_Locality", "Year", "PM_2.5", "PM_10")

#Display the data
head(data)

Since the analysis focuses on PM 2.5 AND PM 10 we would need to drop those rows without these values

#Drop the rows with an NA value for PM 2.5 or PM 10
data <- data[complete.cases(data$PM_2.5, data$PM_10), ]

#Display the dataset
head(data)
#Let's see the new dataset after dropping the NA values
str(data)
## 'data.frame':    8824 obs. of  6 variables:
##  $ Code         : chr  "ALB" "ALB" "ALB" "ALB" ...
##  $ Country      : chr  "Albania" "Albania" "Albania" "Albania" ...
##  $ City_Locality: chr  "Durres" "Korce" "Korce" "Vrith" ...
##  $ Year         : int  2016 2015 2016 2015 2017 2018 2019 2018 2019 2015 ...
##  $ PM_2.5       : num  14.3 30.3 28.6 13.2 32.8 ...
##  $ PM_10        : num  24.6 45.3 40.2 19.5 122.2 ...
#View a summary of the dataset
summary(data)
##      Code             Country          City_Locality           Year     
##  Length:8824        Length:8824        Length:8824        Min.   :2010  
##  Class :character   Class :character   Class :character   1st Qu.:2014  
##  Mode  :character   Mode  :character   Mode  :character   Median :2016  
##                                                           Mean   :2016  
##                                                           3rd Qu.:2018  
##                                                           Max.   :2021  
##      PM_2.5            PM_10       
##  Min.   :  0.010   Min.   :  2.17  
##  1st Qu.:  9.818   1st Qu.: 17.00  
##  Median : 13.360   Median : 21.85  
##  Mean   : 16.479   Mean   : 29.34  
##  3rd Qu.: 19.310   3rd Qu.: 30.48  
##  Max.   :191.900   Max.   :540.00

2. Analyze the data

#Look for the countries with the highest values for PM 2.5 recorded
top_5_pm_2.5 <- head(data[order(-data$PM_2.5), ], 5)

#Print the rows
top_5_pm_2.5
#Look for the countries with the highest values for PM 10 recorded
top_5_pm_10 <- head(data[order(-data$PM_10), ], 5)

#Print the row
top_5_pm_10
#Now let's see the countries with the lowest levels of PM 10 recorded
min_pm_2.5 <- head(data[order(data$PM_2.5), ], 5)

#Print the row
min_pm_2.5
#Now let's see the countries with the lowest levels of PM 10 recorded
min_pm_10 <- head(data[order(data$PM_10), ], 5)

#Print the row
min_pm_10

3. Create visualizations: Histograms

#Create a histogram for the PM_2.5
ggplot(data, aes(x= PM_2.5)) +
  geom_histogram(binwidth = 15, fill = "blue", color = "white") +
  labs(title = "World Wide Levels of PM 2.5", x = "PM 2.5 Levels", y = "Intensity")

#Create a histogram for the PM_10
ggplot(data, aes(x= PM_10)) +
  geom_histogram(binwidth = 15, fill = "black", color = "white") +
  labs(title = "World Wide Levels of PM 10", x = "PM 10 Levels", y = "Intensity")

4. Create a subset to focus on the data for USA only

#Create a subset that will contain only the information from USA
usa <- subset(data, Code == "USA")

usa
#Look for the cities with the highest values of PM 2.5 recorded
top_usa_pm_2.5 <- head(usa[order(-usa$PM_2.5), ], 5)

#Print the rows
top_usa_pm_2.5
#Look for the cities with the lowest values of PM 2.5 recorded
w_usa_pm_2.5 <- head(usa[order(usa$PM_2.5), ], 5)

#Print the rows
w_usa_pm_2.5
#Look for the cities with the highest values for PM 10 recorded
top_usa_pm_10 <- head(usa[order(-usa$PM_10), ], 5)

#Print the rows
top_usa_pm_10
#Look for the cities with the lowest values for PM 10 recorded
w_usa_pm_10 <- head(usa[order(usa$PM_10), ], 5)

#Print the row
w_usa_pm_10

5. Create visualizations: Box Plots

#Create a box plot to display the levels of PM 2.5 in the US

boxplot_2.5 <- ggplot(usa, aes(x = factor(Year), y = PM_2.5, fill = factor(Year))) +
  geom_boxplot() +  # Create box plots
  scale_fill_viridis_d(option = "C", end = 0.9) +  # Use Viridis color palette
  labs(title = "PM 2.5 Levels In the US", x = "Years", y = "Intensity") +
  theme_minimal() +  # Use a minimal theme
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels

boxplot_2.5 <- boxplot_2.5 +
  labs(fill = "Years")

#Display the graph
print(boxplot_2.5)

#Create a box plot to display the levels PM 10 in the US

boxplot_10 <- ggplot(usa, aes(x = factor(Year), y = PM_10, fill = factor(Year))) +
  geom_boxplot() +  # Create box plots
  scale_fill_viridis_d(option = "D", end = 0.9) +  # Use Viridis color palette
  labs(title = "PM 10 Levels In the US", x = "Years", y = "Intensity") +
  theme_minimal() +  # Use a minimal theme
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels

boxplot_10 <- boxplot_10 +
  labs(fill = "Years")

#Display the graph
print(boxplot_10)

6. Find a correlation between PM 2.5 and PM 10

#Find a correlation between PM 2.5 and PM 10
correlation <- cor(usa$PM_2.5, usa$PM_10)

print(correlation)
## [1] 0.5241959

Strength of the Relationship: A correlation coefficient of 0.5 suggests a moderate degree of linear relationship between the two variables. As one variable increases, the other tends to increase as well, but the relationship is not perfect.

Direction of the Relationship: Since the correlation coefficient is positive, it indicates a positive association between the variables. This means that higher values of one variable tend to be associated with higher values of the other variable.

#Create a graph to represent the correlation between the variables
pollution_pairs <- data.frame(usa$PM_2.5, usa$PM_10)

pollutants <- c("PM 2.5", "PM 10")
ggpairs(pollution_pairs, lower.panel = NULL)
## Warning in warn_if_args_exist(list(...)): Extra arguments: "lower.panel" are
## being ignored.  If these are meant to be aesthetics, submit them using the
## 'mapping' variable within ggpairs with ggplot2::aes or ggplot2::aes_string.

7. Create a Machine Learning Model: Linear regression

#We will use linear regression to try to predict PM 2.5 based on the levels of PM 10
lm_2.5 <- lm(PM_2.5 ~ PM_10, data = usa)
summary(lm_2.5)
## 
## Call:
## lm(formula = PM_2.5 ~ PM_10, data = usa)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.0935 -2.4897  0.1401  2.3045 11.6003 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.43561    0.43761   10.14   <2e-16 ***
## PM_10        0.20578    0.01797   11.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.139 on 346 degrees of freedom
## Multiple R-squared:  0.2748, Adjusted R-squared:  0.2727 
## F-statistic: 131.1 on 1 and 346 DF,  p-value: < 2.2e-16

Residual Standard Error (RSE = 3.139): A lower RSE indicates that the model’s predictions are closer to the observed values on average. However, what constitutes a “good” RSE depends on the scale of the dependent variable. Because the units that we are trying to analyze are in a scale from 1-12, the RSE would be considered to “high” for it to predict the values of PM 2.5 based on PM 10

Multiple R-squared and Adjusted R-squared (0.2748): These metrics measure the proportion of variance in the dependent variable that is explained by the independent variables. A higher R-squared value indicates that a larger proportion of the variability in the dependent variable (PM 2.5). In our case a variability of 0.2748 would be considered acceptable for the units that we use in our scale

F-statistic and p-value: The F-statistic tests the overall significance of the regression model, while the associated p-value indicates the probability of observing the data if the null hypothesis (that all regression coefficients are zero) were true. A low p-value (typically less than 0.05) suggests that the model is statistically significant. In this case, the F-statistic is high, and the p-value is very low, indicating that the model is statistically significant.

#Create predictions

#Let's test the model
new_data <- read.csv("PM25National.csv")
preds_2.5 <- predict(lm_2.5, new_data)
preds_2.5 <- data.frame(preds_2.5) # To convert to a data frame

#Display the predictions
head(preds_2.5)
#To compare the predictions to some actual values
comparison <- data.frame(c(data.frame(head(pollution_pairs$usa.PM_2.5)), head(preds_2.5)))

#Create an extrac column to show the difference between the predictions and the actual values
comparison$Difference <- comparison$preds_2.5 - comparison$head.pollution_pairs.usa.PM_2.5.

#Change the column names
names(comparison) <- c("Actual Values PM 2.5", "Predictions PM 2.5", "Difference")

#View the data
comparison