DATA607 Project 2

Dataset 1: Annual estimates of mean surface temperature change measured with respect to a baseline climatology

Analysis: I cleaned and transformed the data, then calculated the mean surface temperature change for each country.

Conclusion: Based on this analysis, Luxembourg, Estonia, Serbia, Belgium, and Latvia have the highest average surface temperature change and may be experiencing a more pronounced impact.

library(dplyr,warn.conflicts = FALSE)
library(tidyr,warn.conflicts = FALSE)
library(readr,warn.conflicts = FALSE)


# Download the annual mean surface temperature change indicator.
# read.csv() already returns a data frame, so no further conversion is needed.
load <- read.csv(
  "https://raw.githubusercontent.com/datanerddhanya/DATA607/main/Indicator_3_1_Climate_Indicators_Annual_Mean_Global_Surface_Temperature_577579683071085080.csv",
  sep = ",",
  quote = "\"",
  stringsAsFactors = FALSE
)

# The file is wide (one column per year). Stack every column outside the
# ObjectId:CTS.Full.Descriptor range into Year / value pairs, dropping rows
# that have no measurement.
load_pivot <- pivot_longer(
  load,
  cols = !(ObjectId:CTS.Full.Descriptor),
  names_to = "Year",
  values_to = "Mean.Surface.Temp.change",
  values_drop_na = TRUE
)

# The year labels carry an "X" prefix; splitting on "X" leaves the (empty)
# prefix part in Blank and the year itself in Year.
load_clean <- separate_wider_delim(
  load_pivot,
  cols = Year,
  delim = "X",
  names = c("Blank", "Year")
)
  
# cleaning up Country column to remove text after comma
  # Keep only the text before the first comma of each element, with leading
  # and trailing whitespace removed.
  #
  # x: character vector (a single string works too).
  # Returns a character vector of the same length as x. Elements without a
  # comma are only trimmed; NA stays NA.
  #
  # Vectorised: the previous `if (grepl(...))` form only accepted one string
  # at a time and errors on longer vectors in current R.
  extract_value <- function(x) {
    trimws(sub(",.*$", "", x))
  }

# Strip the qualifier after the comma from every country name.
# vapply() always returns a character vector (sapply() returns a list when the
# input is empty), and USE.NAMES = FALSE keeps the raw country strings from
# being attached to the column as element names.
load_clean$Country <- vapply(
  load_clean$Country,
  extract_value,
  character(1),
  USE.NAMES = FALSE
)

# Create a new data frame with only the columns needed for the analysis.
load_final <- load_clean[, c(
  "ObjectId", "Country", "ISO3", "Unit", "Year", "Mean.Surface.Temp.change"
)]


# Average temperature change per country across all available years,
# ordered from the largest average to the smallest.
load_climate_change <- load_final |>
  group_by(Country) |>
  summarise(
    Mean.Surface.Temp.change = mean(Mean.Surface.Temp.change)
  ) |>
  arrange(desc(Mean.Surface.Temp.change))

# Show the countries with the highest averages.
head(load_climate_change)

## # A tibble: 6 × 2
##   Country    Mean.Surface.Temp.change
##   <chr>                         <dbl>
## 1 Luxembourg                     1.58
## 2 Estonia                        1.56
## 3 Serbia                         1.54
## 4 Belgium                        1.53
## 5 Latvia                         1.51
## 6 Belarus                        1.51

# Overall centre and spread of the per-country averages.
# Note: these two variables share their names with base::mean() and
# base::sd(). Calls such as mean(x) still reach the functions, because R
# skips non-function objects when it resolves a call.
mean <- mean(load_climate_change[["Mean.Surface.Temp.change"]])
sd <- sd(load_climate_change[["Mean.Surface.Temp.change"]])

# Mean of the country-level averages
head(mean)

## [1] 0.5820715

# Standard deviation of the country-level averages
head(sd)

## [1] 0.3223647

Dataset 2: Weather Data (by Institution): Average Temperature, Days of Precipitation, and Sunny Days

Analysis: Acquire, tidy, clean, transform, and visualize the data.

Conclusion: 1. I observed that a few institutions, such as Ramapo College of New Jersey, do not have an average temperature for some months, so I used na.rm = TRUE to exclude the missing values. 2. When visualized as a histogram, the average temperature has a multimodal distribution and is not symmetric. 3. I see a negative correlation between days with precipitation and sunny days: the fewer the days with precipitation, the more sunny days there are.

library(dplyr,warn.conflicts = FALSE)
library(tidyr,warn.conflicts = FALSE)
library(readr,warn.conflicts = FALSE)
library(ggplot2)
# Scraping the table from the website did not work, so the data is read from
# a CSV copy instead.

# Read the institutions weather data. fileEncoding = "latin1" is used so the
# degree sign (°) in the temperature columns is read correctly.
raw <- read.csv(
  "https://raw.githubusercontent.com/datanerddhanya/DATA607/main/Institutions_Weather_Data.csv",
  sep = ",",
  fileEncoding = "latin1"
)

# For each seasonal temperature column keep the text before the first "°"
# (the average high, per the original analysis). The remainder goes into a
# throw-away "other*" column and any further pieces are dropped.
raw <- raw %>%
  separate_wider_delim(
    Avg.Jan.Temp,
    delim = "°",
    names = c("Avg.Jan.Temp", "other"),
    too_many = "drop"
  ) %>%
  separate_wider_delim(
    Avg.April.Temp,
    delim = "°",
    names = c("Avg.April.Temp", "other1"),
    too_many = "drop"
  ) %>%
  separate_wider_delim(
    Avg.July.Temp,
    delim = "°",
    names = c("Avg.July.Temp", "other2"),
    too_many = "drop"
  ) %>%
  separate_wider_delim(
    Avg.Oct.Temp,
    delim = "°",
    names = c("Avg.Oct.Temp", "other3"),
    too_many = "drop"
  )

# Drop the throw-away columns and keep everything else.
weather_data <- raw %>%
  select(-starts_with("other"))
 
 
 # Turn the four temperature columns from character into numeric.
 # Entries that are not plain numbers become NA (hence the coercion warnings).
 weather_data[["Avg.Jan.Temp"]] <- as.numeric(weather_data[["Avg.Jan.Temp"]])
 weather_data[["Avg.April.Temp"]] <- as.numeric(weather_data[["Avg.April.Temp"]])

## Warning: NAs introduced by coercion

 weather_data[["Avg.July.Temp"]] <- as.numeric(weather_data[["Avg.July.Temp"]])

## Warning: NAs introduced by coercion

 weather_data[["Avg.Oct.Temp"]] <- as.numeric(weather_data[["Avg.Oct.Temp"]])

## Warning: NAs introduced by coercion

 # The temperatures can now be sorted, averaged, used to derive new columns, etc.
 # One institution (Ramapo College of New Jersey) has no April temperature;
 # the row is kept and the missing value is skipped later with na.rm = TRUE.
 head(weather_data)

## # A tibble: 6 × 9
##   Institution City  State Avg.Jan.Temp Avg.April.Temp Avg.July.Temp Avg.Oct.Temp
##   <chr>       <chr> <chr>        <dbl>          <dbl>         <dbl>        <dbl>
## 1 Simmons Un… Bost… MA              36             57            82           62
## 2 Muhlenberg… Alle… PA              38             62            86           65
## 3 Massachuse… Buzz… MA              38             55            80           62
## 4 Manhattan … Rive… NY              38             63            85           65
## 5 University… Dulu… MN              21             48            75           52
## 6 Northern M… Marq… MI              24             48            76           54
## # ℹ 2 more variables: Days.w.Precipitation <dbl>, Sunny.Days <int>

 # Per-institution average of the four seasonal temperatures. Missing seasons
 # are skipped; ungroup() removes the row-wise grouping once it is no longer
 # needed.
 weather_data_stat <- weather_data |>
   rowwise() |>
   mutate(
     avg.temp = mean(
       c(Avg.Jan.Temp, Avg.April.Temp, Avg.July.Temp, Avg.Oct.Temp),
       na.rm = TRUE
     )
   ) |>
   ungroup()

 # Summarise data by state.
 # na.rm = TRUE on every mean: without it a single institution with a missing
 # value turns the whole state into NA, and that state is then dropped from
 # the plots built on this summary.
 weather_data_stat_summary <- weather_data_stat %>%
   group_by(State) %>%
   summarise(
     avg.state.temp = mean(avg.temp, na.rm = TRUE),
     avg.state.prec = mean(Days.w.Precipitation, na.rm = TRUE),
     avg.state.sunnydays = mean(Sunny.Days, na.rm = TRUE)
   )
 
 
 # Histogram of the state-level average temperature with a normal curve of
 # the same mean and standard deviation overlaid for comparison.
 # Original observation: the distribution is multimodal and not symmetric.
 statemean <- mean(weather_data_stat_summary$avg.state.temp)
 statesd <- sd(weather_data_stat_summary$avg.state.temp)
 ggplot(weather_data_stat_summary, aes(avg.state.temp)) +
   # dnorm() returns densities, so the bars must be on the density scale as
   # well; on the default count scale the curve and the bars are not
   # comparable. bins = 30 is the value ggplot2 was already defaulting to.
   geom_histogram(aes(y = after_stat(density)), bins = 30) +
   stat_function(
     fun = dnorm,
     args = list(mean = statemean, sd = statesd),
     colour = "tomato"
   )

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 # Scatter plot of state-level sunny days (x) against days with
 # precipitation (y) to look for a relationship between the two.
 # Original observation: the relationship is negative - fewer days with
 # precipitation go with more sunny days.
 ggplot(
   data = weather_data_stat_summary,
   mapping = aes(x = avg.state.sunnydays, y = avg.state.prec)
 ) +
   geom_point()

## Warning: Removed 1 rows containing missing values (`geom_point()`).

Dataset 3: New York City Air Quality Data

Analysis: Compare air pollutants levels vs. locations vs. time of year.

Conclusion: I can observe here that the pollution levels for Nitrogen dioxide (NO2) and Fine particles (PM 2.5) have decreased over the years.

The Midtown (CD5) location has the highest pollution levels for Nitrogen dioxide (NO2) and Fine particles (PM 2.5) for 2021.

library(dplyr,warn.conflicts = FALSE)
library(tidyr,warn.conflicts = FALSE)
library(readr,warn.conflicts = FALSE)
library(ggplot2)

# Read the air quality CSV file.
pollutant_levels <- read.csv('https://raw.githubusercontent.com/datanerddhanya/DATA607/main/Air_Quality_20240302.csv', fileEncoding = "latin1")

# Keep only the rows whose Time.Period mentions "Annual".
pollutant_levels_summary <- pollutant_levels[grep("Annual", pollutant_levels$Time.Period), ]

# Clean up Time.Period so only the year remains. The prefix is removed
# outright and the result trimmed; replacing it with a space (as before)
# left every year with a leading blank, e.g. " 2021". The year is kept as
# text so it still works as a discrete label.
pollutant_levels_summary$Time.Period <- trimws(
  sub("Annual Average", "", pollutant_levels_summary$Time.Period, fixed = TRUE)
)

# Summarize to Pollutant and Year.
# .groups = "drop" returns an ungrouped result instead of one that is still
# grouped by Name.
pollutant_levels_name <- pollutant_levels_summary |>
  group_by(Name, Time.Period) |>
  summarise(mean = mean(Data.Value), .groups = "drop")

## `summarise()` has grouped output by 'Name'. You can override using the
## `.groups` argument.

# Summarize to Location and Year.
# .groups = "drop" returns an ungrouped result instead of one that is still
# grouped by Geo.Place.Name.
pollutant_levels_location <- pollutant_levels_summary |>
  group_by(Geo.Place.Name, Time.Period) |>
  summarise(mean = mean(Data.Value), .groups = "drop")

## `summarise()` has grouped output by 'Geo.Place.Name'. You can override using
## the `.groups` argument.

# Summarize to Pollutant, Location and Year.
# .groups = "drop" returns an ungrouped result instead of one that is still
# grouped by Name and Geo.Place.Name.
pollutant_levels_name_location <- pollutant_levels_summary |>
  group_by(Name, Geo.Place.Name, Time.Period) |>
  summarise(mean = mean(Data.Value), .groups = "drop")

## `summarise()` has grouped output by 'Name', 'Geo.Place.Name'. You can override
## using the `.groups` argument.

# Visualise the yearly mean level of each pollutant.
# Original observation: the levels of Nitrogen dioxide (NO2) and Fine
# particles (PM 2.5) have decreased over the years.
#
# fill (not colour) shades the whole bar per pollutant; colour only changes
# the outline. position = "dodge" puts the pollutants side by side instead of
# stacking their means, so each pollutant's trend can be read on its own.
ggplot(data = pollutant_levels_name, aes(Time.Period, mean)) +
  geom_col(aes(fill = Name), position = "dodge")

# Rank the pollutant / location combinations for 2021, highest mean first,
# to see where Nitrogen dioxide (NO2) and Fine particles (PM 2.5) peak.
pollutant_levels_name_location |>
  filter(grepl("2021", Time.Period)) |>
  arrange(desc(mean))

## # A tibble: 228 × 4
## # Groups:   Name, Geo.Place.Name [228]
##    Name                   Geo.Place.Name                       Time.Period  mean
##    <chr>                  <chr>                                <chr>       <dbl>
##  1 Nitrogen dioxide (NO2) Midtown (CD5)                        " 2021"      25.2
##  2 Nitrogen dioxide (NO2) Gramercy Park - Murray Hill          " 2021"      23.5
##  3 Nitrogen dioxide (NO2) Chelsea - Clinton                    " 2021"      23.3
##  4 Nitrogen dioxide (NO2) Chelsea-Village                      " 2021"      22.6
##  5 Nitrogen dioxide (NO2) Clinton and Chelsea (CD4)            " 2021"      22.3
##  6 Nitrogen dioxide (NO2) Stuyvesant Town and Turtle Bay (CD6) " 2021"      22.1
##  7 Nitrogen dioxide (NO2) Upper East Side-Gramercy             " 2021"      21.8
##  8 Nitrogen dioxide (NO2) Financial District (CD1)             " 2021"      21.6
##  9 Nitrogen dioxide (NO2) Lower Manhattan                      " 2021"      21.5
## 10 Nitrogen dioxide (NO2) Greenwich Village - SoHo             " 2021"      21.3
## # ℹ 218 more rows

#Midtown(CD5) has the highest pollution levels for Nitrogen dioxide (NO2) and Fine particles (PM 2.5) for 2021.

DATA607 Project 2

Dhanya Nair

2024-03-02