DATA607 Project 2

Dataset 2: Weather Data (by Institution): Average Temperature, Days of Precipitation, and Sunny Days

Analysis: Acquire, tidy, clean, transform, and visualize the data.

Conclusion: 1. I observed that a few institutions, such as Ramapo College of New Jersey, do not have an average temperature for some months, so I had to use na.rm = TRUE to exclude those missing values. 2. When the mean of the average temperatures is visualized with a histogram, the distribution is multimodal and not symmetric. 3. I see a negative correlation between precipitation and sunny days: the fewer the days with precipitation, the more sunny days there are.

library(dplyr,warn.conflicts = FALSE)
library(tidyr,warn.conflicts = FALSE)
library(readr,warn.conflicts = FALSE)
library(ggplot2)
# Scraping the table directly from the website was attempted first but was not
# successful, so the data is read from a CSV copy hosted on GitHub instead.

# Read the institutions weather data CSV.
# The file is decoded as latin1 so that the degree sign (°) embedded in the
# temperature columns is read correctly.
raw <- read.csv(
  file = "https://raw.githubusercontent.com/datanerddhanya/DATA607/main/Institutions_Weather_Data.csv",
  fileEncoding = "latin1"
)

# Keep only the leading value of each temperature column (per the original
# author's note, this is the average high temperature).
# Every column is split on the degree sign (°): the text before the first °
# replaces the original column, the next piece lands in a throwaway "other*"
# column, and any further pieces are dropped (too_many = "drop").
temp_splits <- c(
  Avg.Jan.Temp = "other",
  Avg.April.Temp = "other1",
  Avg.July.Temp = "other2",
  Avg.Oct.Temp = "other3"
)

raw <- Reduce(
  function(df, temp_col) {
    separate_wider_delim(
      df,
      cols = all_of(temp_col),
      delim = "°",
      names = c(temp_col, temp_splits[[temp_col]]),
      too_many = "drop"
    )
  },
  names(temp_splits),
  raw
)

# Keep only the columns needed for the analysis: everything except the
# throwaway "other*" columns created by the split above.
weather_data <- select(raw, -starts_with("other"))
 
 
 # Convert the four temperature columns from character to numeric.
 # Entries that cannot be parsed as numbers become NA; in the rendered run R
 # reported "NAs introduced by coercion" for the April, July and October columns.
 for (temp_col in c("Avg.Jan.Temp", "Avg.April.Temp", "Avg.July.Temp", "Avg.Oct.Temp")) {
   weather_data[[temp_col]] <- as.numeric(weather_data[[temp_col]])
 }

 # The temperatures can now be sorted, averaged, used to derive new columns, etc.
 # Some institutions (e.g. Ramapo College of New Jersey, for April) have no
 # temperature value; those entries stay NA here and are skipped later when the
 # means are computed with na.rm = TRUE.
 head(weather_data)

## # A tibble: 6 × 9
##   Institution City  State Avg.Jan.Temp Avg.April.Temp Avg.July.Temp Avg.Oct.Temp
##   <chr>       <chr> <chr>        <dbl>          <dbl>         <dbl>        <dbl>
## 1 Simmons Un… Bost… MA              36             57            82           62
## 2 Muhlenberg… Alle… PA              38             62            86           65
## 3 Massachuse… Buzz… MA              38             55            80           62
## 4 Manhattan … Rive… NY              38             63            85           65
## 5 University… Dulu… MN              21             48            75           52
## 6 Northern M… Marq… MI              24             48            76           54
## # ℹ 2 more variables: Days.w.Precipitation <dbl>, Sunny.Days <int>

 # Per-institution average temperature: the mean of the January, April, July
 # and October values. rowMeans() is vectorised over rows, so the result is a
 # plain (not row-wise) data frame; na.rm = TRUE skips months with no recorded
 # temperature instead of turning the whole average into NA.
 weather_data_stat <- weather_data |>
   mutate(
     avg.temp = rowMeans(
       cbind(Avg.Jan.Temp, Avg.April.Temp, Avg.July.Temp, Avg.Oct.Temp),
       na.rm = TRUE
     )
   )

 # Summarise the data by state.
 # na.rm = TRUE is required on every mean: without it, a single institution
 # with a missing value makes the average for its entire state NA, and that
 # state then silently drops out of the plots.
 weather_data_stat_summary <- weather_data_stat |>
   group_by(State) |>
   summarise(
     avg.state.temp = mean(avg.temp, na.rm = TRUE),
     avg.state.prec = mean(Days.w.Precipitation, na.rm = TRUE),
     avg.state.sunnydays = mean(Sunny.Days, na.rm = TRUE)
   )
 
 
 # Histogram of the state-level average temperatures with a fitted normal
 # curve overlaid for comparison.
 # Author's observation: the distribution is multimodal and not symmetric.
 # na.rm = TRUE keeps a state without a usable average from turning the
 # fitted mean / sd into NA.
 statemean <- mean(weather_data_stat_summary$avg.state.temp, na.rm = TRUE)
 statesd <- sd(weather_data_stat_summary$avg.state.temp, na.rm = TRUE)

 ggplot(weather_data_stat_summary, aes(avg.state.temp)) +
   # dnorm() returns a density, so the histogram must be drawn on the density
   # scale as well; on the default count scale the curve would be squashed
   # against the x-axis. bins = 30 is the value ggplot2 used implicitly and is
   # now stated explicitly.
   geom_histogram(aes(y = after_stat(density)), bins = 30) +
   # stat_function() expects the extra arguments for `fun` as a list.
   stat_function(
     fun = dnorm,
     args = list(mean = statemean, sd = statesd),
     col = "tomato"
   )

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 # Scatter plot of the state averages: sunny days on the x-axis against days
 # with precipitation on the y-axis, to check whether the two are correlated.
 # Author's observation: the relationship is negative - the fewer the days with
 # precipitation, the more sunny days.
 ggplot(
   data = weather_data_stat_summary,
   mapping = aes(x = avg.state.sunnydays, y = avg.state.prec)
 ) +
   geom_point()

## Warning: Removed 1 rows containing missing values (`geom_point()`).

DATA607 Project 2

Dhanya Nair

2024-03-02