I would like you to do a small analysis of data from the San Diego International Airport. Get the data from NOAA and use it to recreate any five graphs, your choice, from our analysis. Don’t forget to clean the data. Submit your analysis as a link to an RPubs document.

Libraries used:

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readr)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Downloaded the .csv file from NOAA (selected date range: 05/15/1941 to 03/13/2022).

# Read the NOAA export with readr, keeping the DATE column as plain text ("character") for now,
# then take a first look at the result with glimpse().
sd_airport <- read_csv(
  file = "San_Diego_weather.csv",
  col_types = cols(DATE = col_character())
)
sd_airport %>% glimpse()
## Rows: 29,517
## Columns: 6
## $ STATION <chr> "USW00023188", "USW00023188", "USW00023188", "USW00023188", "U~
## $ NAME    <chr> "SAN DIEGO INTERNATIONAL AIRPORT, CA US", "SAN DIEGO INTERNATI~
## $ DATE    <chr> "1941-05-15", "1941-05-16", "1941-05-17", "1941-05-18", "1941-~
## $ PRCP    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ TMAX    <dbl> 74, 74, 70, 68, 81, 84, 85, 75, 75, 74, 86, 80, 75, 73, 72, 72~
## $ TMIN    <dbl> 58, 61, 60, 59, 54, 56, 58, 60, 63, 61, 60, 60, 58, 62, 61, 60~

Clean the data:

# The character column DATE holds dates in ISO-8601 format (YYYY-MM-DD), which as.Date() parses
# without an explicit format. Convert the column, then run glimpse() again to confirm the <date> type.
sd_airport$DATE <- as.Date(sd_airport$DATE)
glimpse(sd_airport)
## Rows: 29,517
## Columns: 6
## $ STATION <chr> "USW00023188", "USW00023188", "USW00023188", "USW00023188", "U~
## $ NAME    <chr> "SAN DIEGO INTERNATIONAL AIRPORT, CA US", "SAN DIEGO INTERNATI~
## $ DATE    <date> 1941-05-15, 1941-05-16, 1941-05-17, 1941-05-18, 1941-05-19, 1~
## $ PRCP    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ TMAX    <dbl> 74, 74, 70, 68, 81, 84, 85, 75, 75, 74, 86, 80, 75, 73, 72, 72~
## $ TMIN    <dbl> 58, 61, 60, 59, 54, 56, 58, 60, 63, 61, 60, 60, 58, 62, 61, 60~
# Summarise every column and scan the ranges and NA counts for anomalies.
sd_airport %>% summary()
##    STATION              NAME                DATE                 PRCP       
##  Length:29517       Length:29517       Min.   :1941-05-15   Min.   :0.0000  
##  Class :character   Class :character   1st Qu.:1961-07-31   1st Qu.:0.0000  
##  Mode  :character   Mode  :character   Median :1981-10-13   Median :0.0000  
##                                        Mean   :1981-10-12   Mean   :0.0264  
##                                        3rd Qu.:2001-12-26   3rd Qu.:0.0000  
##                                        Max.   :2022-03-10   Max.   :2.7000  
##                                                             NA's   :2       
##       TMAX            TMIN      
##  Min.   : 46.0   Min.   :29.00  
##  1st Qu.: 66.0   1st Qu.:52.00  
##  Median : 70.0   Median :58.00  
##  Mean   : 70.7   Mean   :57.27  
##  3rd Qu.: 75.0   3rd Qu.:63.00  
##  Max.   :111.0   Max.   :78.00  
##  NA's   :3       NA's   :3
# Inspect NA values, one measurement column at a time.
# TMAX: the NAs came from one day in 1946 and two days in 2022.
sd_airport %>%
  filter(is.na(TMAX)) %>%
  glimpse()
## Rows: 3
## Columns: 6
## $ STATION <chr> "USW00023188", "USW00023188", "USW00023188"
## $ NAME    <chr> "SAN DIEGO INTERNATIONAL AIRPORT, CA US", "SAN DIEGO INTERNATI~
## $ DATE    <date> 1946-07-17, 2022-03-09, 2022-03-10
## $ PRCP    <dbl> 0, NA, NA
## $ TMAX    <dbl> NA, NA, NA
## $ TMIN    <dbl> 65, NA, NA
# TMIN: the NAs came from one day in 1946 and two days in 2022.
sd_airport %>%
  filter(is.na(TMIN)) %>%
  glimpse()
## Rows: 3
## Columns: 6
## $ STATION <chr> "USW00023188", "USW00023188", "USW00023188"
## $ NAME    <chr> "SAN DIEGO INTERNATIONAL AIRPORT, CA US", "SAN DIEGO INTERNATI~
## $ DATE    <date> 1946-02-23, 2022-03-09, 2022-03-10
## $ PRCP    <dbl> 0, NA, NA
## $ TMAX    <dbl> 64, NA, NA
## $ TMIN    <dbl> NA, NA, NA
# PRCP: the NAs came from two days in 2022.
sd_airport %>%
  filter(is.na(PRCP)) %>%
  glimpse()
## Rows: 2
## Columns: 6
## $ STATION <chr> "USW00023188", "USW00023188"
## $ NAME    <chr> "SAN DIEGO INTERNATIONAL AIRPORT, CA US", "SAN DIEGO INTERNATI~
## $ DATE    <date> 2022-03-09, 2022-03-10
## $ PRCP    <dbl> NA, NA
## $ TMAX    <dbl> NA, NA
## $ TMIN    <dbl> NA, NA
# Drop the records that are missing any of the three measurements. Naming the columns keeps the
# clean-up limited to PRCP, TMAX and TMIN; summary() then confirms that no NAs remain.
sd_airport <- sd_airport %>%
  drop_na(PRCP, TMAX, TMIN)
summary(sd_airport)
##    STATION              NAME                DATE                 PRCP        
##  Length:29513       Length:29513       Min.   :1941-05-15   Min.   :0.00000  
##  Class :character   Class :character   1st Qu.:1961-08-01   1st Qu.:0.00000  
##  Mode  :character   Mode  :character   Median :1981-10-13   Median :0.00000  
##                                        Mean   :1981-10-12   Mean   :0.02641  
##                                        3rd Qu.:2001-12-25   3rd Qu.:0.00000  
##                                        Max.   :2022-03-08   Max.   :2.70000  
##       TMAX            TMIN      
##  Min.   : 46.0   Min.   :29.00  
##  1st Qu.: 66.0   1st Qu.:52.00  
##  Median : 70.0   Median :58.00  
##  Mean   : 70.7   Mean   :57.27  
##  3rd Qu.: 75.0   3rd Qu.:63.00  
##  Max.   :111.0   Max.   :78.00

Density plots:

# Density curve with a rug of the individual observations for each measurement:
# TMAX, TMIN and PRCP, one plot per column.
ggplot(data = sd_airport, mapping = aes(x = TMAX)) +
  geom_density() +
  geom_rug() +
  ggtitle("TMAX")

ggplot(data = sd_airport, mapping = aes(x = TMIN)) +
  geom_density() +
  geom_rug() +
  ggtitle("TMIN")

ggplot(data = sd_airport, mapping = aes(x = PRCP)) +
  geom_density() +
  geom_rug() +
  ggtitle("PRCP")

Plot showing which years since 2010 had above- or below-average total rainfall. A z-score of 0 corresponds to the average annual rainfall over this period; points above 0 are years with more rainfall than average, and points below 0 are years with less rainfall than average.

# Total rainfall per calendar year from 2010 onward, standardised against the mean and the
# standard deviation of those yearly totals.
# NOTE: the cleaned data end on 2022-03-08, so the 2022 total covers only part of a year.
rain_analysis <- sd_airport %>%
  select(DATE, PRCP) %>%
  mutate(yr = year(DATE)) %>%
  filter(yr >= 2010) %>%
  group_by(yr) %>%
  summarize(rain = sum(PRCP)) %>%
  ungroup() %>%
  mutate(
    mrain = mean(rain),                 # average of the yearly totals
    sdrain = sd(rain),                  # spread of the yearly totals
    z_score = (rain - mrain) / sdrain   # reuse mrain instead of recomputing the mean
  )

head(rain_analysis)
## # A tibble: 6 x 5
##      yr  rain mrain sdrain z_score
##   <dbl> <dbl> <dbl>  <dbl>   <dbl>
## 1  2010 16.3   8.73   3.81  1.98  
## 2  2011  9.08  8.73   3.81  0.0924
## 3  2012  6.64  8.73   3.81 -0.548 
## 4  2013  5.57  8.73   3.81 -0.828 
## 5  2014  7.77  8.73   3.81 -0.251 
## 6  2015  9.89  8.73   3.81  0.305
# Yearly z-scores as points joined by a thin line; the horizontal line at 0 marks the average.
ggplot(data = rain_analysis, mapping = aes(x = yr, y = z_score)) +
  geom_point() +
  geom_line(size = 0.1) +
  geom_hline(yintercept = 0)

Bar plot of how many days in each month of 2021 had a maximum temperature of exactly 75 degrees with no rain.

# Count, per month, the 2021 days with TMAX of exactly 75 and no precipitation.
sd_airport %>%
  mutate(mo = month(DATE, label = TRUE), yr = year(DATE)) %>%
  filter(yr == 2021 & TMAX == 75 & PRCP == 0) %>%
  glimpse() %>%  # prints the matching rows and passes them on to the plot
  ggplot(aes(x = mo)) +
  geom_bar() +
  # Keep all twelve month levels on the axis so months with no matching days show as empty
  # instead of disappearing from the plot.
  scale_x_discrete(drop = FALSE)
## Rows: 19
## Columns: 8
## $ STATION <chr> "USW00023188", "USW00023188", "USW00023188", "USW00023188", "U~
## $ NAME    <chr> "SAN DIEGO INTERNATIONAL AIRPORT, CA US", "SAN DIEGO INTERNATI~
## $ DATE    <date> 2021-02-23, 2021-06-12, 2021-06-13, 2021-06-14, 2021-06-21, 2~
## $ PRCP    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ TMAX    <dbl> 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75~
## $ TMIN    <dbl> 47, 59, 64, 63, 65, 65, 66, 67, 67, 69, 70, 67, 66, 67, 67, 64~
## $ mo      <ord> Feb, Jun, Jun, Jun, Jun, Jun, Jul, Jul, Jul, Jul, Jul, Aug, Se~
## $ yr      <dbl> 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 20~

Bar plot of how many days in each month of 2021 had a maximum temperature between 75 and 85 degrees (inclusive) with no rain.

# Count, per month, the 2021 days with TMAX from 75 to 85 (both ends included) and no precipitation.
sd_airport %>%
  mutate(mo = month(DATE, label = TRUE), yr = year(DATE)) %>%
  filter(yr == 2021 & TMAX >= 75 & TMAX <= 85 & PRCP == 0) %>%
  glimpse() %>%  # prints the matching rows and passes them on to the plot
  ggplot(aes(x = mo)) +
  geom_bar() +
  # Keep all twelve month levels on the axis so months with no matching days show as empty
  # instead of disappearing from the plot.
  scale_x_discrete(drop = FALSE)
## Rows: 110
## Columns: 8
## $ STATION <chr> "USW00023188", "USW00023188", "USW00023188", "USW00023188", "U~
## $ NAME    <chr> "SAN DIEGO INTERNATIONAL AIRPORT, CA US", "SAN DIEGO INTERNATI~
## $ DATE    <date> 2021-01-14, 2021-01-17, 2021-02-21, 2021-02-22, 2021-02-23, 2~
## $ PRCP    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ TMAX    <dbl> 81, 82, 82, 78, 75, 77, 82, 82, 80, 81, 83, 76, 76, 76, 75, 75~
## $ TMIN    <dbl> 48, 52, 46, 45, 47, 46, 50, 59, 54, 55, 59, 57, 59, 58, 59, 64~
## $ mo      <ord> Jan, Jan, Feb, Feb, Feb, Mar, Mar, Apr, Apr, Apr, Apr, May, Ma~
## $ yr      <dbl> 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 20~

Bar plot of how many days in each month, over the whole record since 1941, had a minimum temperature of 32 degrees or less.

# Count, per month, the days in the whole record with TMIN of 32 or less.
sd_airport %>%
  mutate(mo = month(DATE, label = TRUE)) %>%
  filter(TMIN <= 32) %>%
  glimpse() %>%  # prints the matching rows and passes them on to the plot
  ggplot(aes(x = mo)) +
  geom_bar() +
  # Keep all twelve month levels on the axis so months with no matching days show as empty
  # instead of disappearing from the plot.
  scale_x_discrete(drop = FALSE)
## Rows: 3
## Columns: 7
## $ STATION <chr> "USW00023188", "USW00023188", "USW00023188"
## $ NAME    <chr> "SAN DIEGO INTERNATIONAL AIRPORT, CA US", "SAN DIEGO INTERNATI~
## $ DATE    <date> 1949-01-04, 1949-01-05, 1963-01-13
## $ PRCP    <dbl> 0, 0, 0
## $ TMAX    <dbl> 52, 59, 60
## $ TMIN    <dbl> 29, 30, 31
## $ mo      <ord> Jan, Jan, Jan