library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readr)
# Load the station's records from the CSV export. DATE is deliberately read
# as plain text here; it is converted to a Date in a separate step below.
sandiego_airport <- read_csv(
  file = "2933363.csv",
  col_types = cols(DATE = col_character())
)
# Quick look at the column names, types and first few values.
sandiego_airport %>% glimpse()
## Rows: 29,543
## Columns: 6
## $ STATION <chr> "USW00023188", "USW00023188", "USW00023188", "USW00023188", "U~
## $ NAME    <chr> "SAN DIEGO INTERNATIONAL AIRPORT, CA US", "SAN DIEGO INTERNATI~
## $ DATE    <chr> "1941-05-13", "1941-05-14", "1941-05-15", "1941-05-16", "1941-~
## $ PRCP    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ TMAX    <dbl> 74, 73, 74, 74, 70, 68, 81, 84, 85, 75, 75, 74, 86, 80, 75, 73~
## $ TMIN    <dbl> 59, 60, 58, 61, 60, 59, 54, 56, 58, 60, 63, 61, 60, 60, 58, 62~
# Replace the text DATE column with a proper Date vector so that it can be
# summarized as a date and split into year / month / day later on.
sandiego_airport[["DATE"]] <- as.Date(sandiego_airport[["DATE"]])
# Confirm that DATE is now reported as <date>.
sandiego_airport %>% glimpse()
## Rows: 29,543
## Columns: 6
## $ STATION <chr> "USW00023188", "USW00023188", "USW00023188", "USW00023188", "U~
## $ NAME    <chr> "SAN DIEGO INTERNATIONAL AIRPORT, CA US", "SAN DIEGO INTERNATI~
## $ DATE    <date> 1941-05-13, 1941-05-14, 1941-05-15, 1941-05-16, 1941-05-17, 1~
## $ PRCP    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ TMAX    <dbl> 74, 73, 74, 74, 70, 68, 81, 84, 85, 75, 75, 74, 86, 80, 75, 73~
## $ TMIN    <dbl> 59, 60, 58, 61, 60, 59, 54, 56, 58, 60, 63, 61, 60, 60, 58, 62~
# Per-column summaries; the NA's rows reveal which columns have missing values.
sandiego_airport %>% summary()
##    STATION              NAME                DATE                 PRCP        
##  Length:29543       Length:29543       Min.   :1941-05-13   Min.   :0.00000  
##  Class :character   Class :character   1st Qu.:1961-08-04   1st Qu.:0.00000  
##  Mode  :character   Mode  :character   Median :1981-10-24   Median :0.00000  
##                                        Mean   :1981-10-23   Mean   :0.02641  
##                                        3rd Qu.:2002-01-12   3rd Qu.:0.00000  
##                                        Max.   :2022-04-03   Max.   :2.70000  
##                                                             NA's   :2        
##       TMAX             TMIN      
##  Min.   : 46.00   Min.   :29.00  
##  1st Qu.: 66.00   1st Qu.:52.00  
##  Median : 70.00   Median :58.00  
##  Mean   : 70.69   Mean   :57.27  
##  3rd Qu.: 75.00   3rd Qu.:63.00  
##  Max.   :111.00   Max.   :78.00  
##  NA's   :3        NA's   :3
# Show every row in which at least one of the three measurements is missing.
sandiego_airport %>%
  filter(if_any(c(TMAX, TMIN, PRCP), is.na))
## # A tibble: 4 x 6
##   STATION     NAME                                  DATE        PRCP  TMAX  TMIN
##   <chr>       <chr>                                 <date>     <dbl> <dbl> <dbl>
## 1 USW00023188 SAN DIEGO INTERNATIONAL AIRPORT, CA ~ 1946-02-23     0    64    NA
## 2 USW00023188 SAN DIEGO INTERNATIONAL AIRPORT, CA ~ 1946-07-17     0    NA    65
## 3 USW00023188 SAN DIEGO INTERNATIONAL AIRPORT, CA ~ 2022-03-24    NA    NA    NA
## 4 USW00023188 SAN DIEGO INTERNATIONAL AIRPORT, CA ~ 2022-04-03    NA    NA    NA

#There is missing data from periods in 1946 and 2022

# Remove every row that contains an NA in any column, then re-run the
# summaries to confirm that no NA's remain.
sandiego_airport <- drop_na(sandiego_airport)
sandiego_airport %>% summary()
##    STATION              NAME                DATE                 PRCP        
##  Length:29539       Length:29539       Min.   :1941-05-13   Min.   :0.00000  
##  Class :character   Class :character   1st Qu.:1961-08-05   1st Qu.:0.00000  
##  Mode  :character   Mode  :character   Median :1981-10-24   Median :0.00000  
##                                        Mean   :1981-10-23   Mean   :0.02642  
##                                        3rd Qu.:2002-01-11   3rd Qu.:0.00000  
##                                        Max.   :2022-04-02   Max.   :2.70000  
##       TMAX             TMIN      
##  Min.   : 46.00   Min.   :29.00  
##  1st Qu.: 66.00   1st Qu.:52.00  
##  Median : 70.00   Median :58.00  
##  Mean   : 70.69   Mean   :57.27  
##  3rd Qu.: 75.00   3rd Qu.:63.00  
##  Max.   :111.00   Max.   :78.00

#We will now get the density with rug plots for TMAX and TMIN

# Density estimate of TMAX; the rug draws one tick per observation along
# the x-axis.
ggplot(sandiego_airport, aes(x = TMAX)) +
  geom_density() +
  geom_rug() +
  labs(title = "TMAX")

#Looking at the graph, we can see the density decreasing, then rising a bit before fully going down.

# Density estimate of TMIN; the rug draws one tick per observation along
# the x-axis.
ggplot(sandiego_airport, aes(x = TMIN)) +
  geom_density() +
  geom_rug() +
  labs(title = "TMIN")

#Here we can see that there is a primary peak with weak shoulders to the left and right of it.

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Split DATE into year, month and day-of-month, each stored as a factor so
# that summary() tabulates them and plots treat them as discrete values.
sandiego_airport <- mutate(
  sandiego_airport,
  yr = factor(year(DATE)),
  mo = factor(month(DATE)),
  dy = factor(day(DATE))
)
# Confirm the three new <fct> columns.
sandiego_airport %>% glimpse()
## Rows: 29,539
## Columns: 9
## $ STATION <chr> "USW00023188", "USW00023188", "USW00023188", "USW00023188", "U~
## $ NAME    <chr> "SAN DIEGO INTERNATIONAL AIRPORT, CA US", "SAN DIEGO INTERNATI~
## $ DATE    <date> 1941-05-13, 1941-05-14, 1941-05-15, 1941-05-16, 1941-05-17, 1~
## $ PRCP    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ TMAX    <dbl> 74, 73, 74, 74, 70, 68, 81, 84, 85, 75, 75, 74, 86, 80, 75, 73~
## $ TMIN    <dbl> 59, 60, 58, 61, 60, 59, 54, 56, 58, 60, 63, 61, 60, 60, 58, 62~
## $ yr      <fct> 1941, 1941, 1941, 1941, 1941, 1941, 1941, 1941, 1941, 1941, 19~
## $ mo      <fct> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,~
## $ dy      <fct> 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28~
# Summaries again; the yr / mo / dy factors are reported as level counts.
sandiego_airport %>% summary()
##    STATION              NAME                DATE                 PRCP        
##  Length:29539       Length:29539       Min.   :1941-05-13   Min.   :0.00000  
##  Class :character   Class :character   1st Qu.:1961-08-05   1st Qu.:0.00000  
##  Mode  :character   Mode  :character   Median :1981-10-24   Median :0.00000  
##                                        Mean   :1981-10-23   Mean   :0.02642  
##                                        3rd Qu.:2002-01-11   3rd Qu.:0.00000  
##                                        Max.   :2022-04-02   Max.   :2.70000  
##                                                                              
##       TMAX             TMIN             yr              mo       
##  Min.   : 46.00   Min.   :29.00   1944   :  366   1      : 2511  
##  1st Qu.: 66.00   1st Qu.:52.00   1948   :  366   8      : 2511  
##  Median : 70.00   Median :58.00   1952   :  366   10     : 2511  
##  Mean   : 70.69   Mean   :57.27   1956   :  366   12     : 2511  
##  3rd Qu.: 75.00   3rd Qu.:63.00   1960   :  366   3      : 2510  
##  Max.   :111.00   Max.   :78.00   1964   :  366   7      : 2510  
##                                   (Other):27343   (Other):14475  
##        dy       
##  1      :  971  
##  2      :  971  
##  13     :  971  
##  14     :  971  
##  15     :  971  
##  16     :  971  
##  (Other):23713
# Persist the cleaned data frame, including the yr / mo / dy factors.
save(sandiego_airport, file = "sandiego_airport.Rdata")

# Total May-August precipitation per year, standardized across years.
# select() drops the factor yr / mo columns, so both are rebuilt here as
# plain numbers: mo for the range filter, yr for a continuous plot axis.
rain_5_8 <- sandiego_airport %>%
  select(DATE, PRCP) %>%
  mutate(
    yr = year(DATE),
    mo = month(DATE)
  ) %>%
  filter(between(mo, 5, 8)) %>%
  group_by(yr) %>%
  summarize(rain = sum(PRCP), .groups = "drop") %>%
  mutate(
    mrain = mean(rain),               # mean of the yearly totals
    sdrain = sd(rain),                # standard deviation of the yearly totals
    z_score = (rain - mrain) / sdrain
  )

rain_5_8 %>% head()
## # A tibble: 6 x 5
##      yr  rain mrain sdrain z_score
##   <dbl> <dbl> <dbl>  <dbl>   <dbl>
## 1  1941  0.09 0.372  0.672 -0.419 
## 2  1942  0.12 0.372  0.672 -0.375 
## 3  1943  0.03 0.372  0.672 -0.509 
## 4  1944  0.32 0.372  0.672 -0.0770
## 5  1945  1.06 0.372  0.672  1.02  
## 6  1946  0.01 0.372  0.672 -0.538
# Yearly z-scores drawn as points, joined by a thin line to show the sequence.
ggplot(rain_5_8, aes(x = yr, y = z_score)) +
  geom_point() +
  geom_line(size = 0.1)

# Range and quartiles of the yearly totals and their z-scores.
rain_5_8 %>% summary()
##        yr            rain            mrain            sdrain      
##  Min.   :1941   Min.   :0.0000   Min.   :0.3717   Min.   :0.6719  
##  1st Qu.:1961   1st Qu.:0.0400   1st Qu.:0.3717   1st Qu.:0.6719  
##  Median :1981   Median :0.1700   Median :0.3717   Median :0.6719  
##  Mean   :1981   Mean   :0.3717   Mean   :0.3717   Mean   :0.6719  
##  3rd Qu.:2001   3rd Qu.:0.4000   3rd Qu.:0.3717   3rd Qu.:0.6719  
##  Max.   :2021   Max.   :4.1500   Max.   :0.3717   Max.   :0.6719  
##     z_score        
##  Min.   :-0.55325  
##  1st Qu.:-0.49372  
##  Median :-0.30024  
##  Mean   : 0.00000  
##  3rd Qu.: 0.04208  
##  Max.   : 5.62328

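#The lowest z-score is -0.55325, which belongs to a year with zero May-August rainfall.
#It is not 1941 (z-score -0.419 in the head() output): summary() reports each column's
#minimum independently, so the minimum yr and the minimum z_score come from different rows.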

# Histogram of the yearly May-August rainfall totals. The bin count is left
# at the stat_bin() default.
ggplot(rain_5_8, aes(x = rain)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Compared to Olympia International Airport, less rain occurred at San Diego Airport.

# Distribution of the yearly number of "perfect" days, i.e. days with TMAX
# of exactly 75 and no precipitation. Each bar is one yearly count; its
# height is the number of years that had that many perfect days.
#
# count() replaces the manual group_by() + summarize(n()) + ungroup().
# Years without any perfect day have no rows left after the filter, so they
# form no group and no separate `count > 0` filter is needed.
sandiego_airport %>%
  filter(TMAX == 75, PRCP == 0) %>%
  count(yr, name = "count") %>%
  ggplot(aes(x = factor(count))) +
  geom_bar()

#The most common yearly count is 18, i.e. 18 is the number of perfect days that San Diego has most often had in a single year.

# Number of "perfect" days (TMAX of exactly 75 and no precipitation) that
# fall in each calendar month, pooled over all years.
ggplot(
  filter(sandiego_airport, TMAX == 75, PRCP == 0),
  aes(x = mo)
) +
  geom_bar()

#July had the highest count of perfect days, with August close behind.