library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readr)
# Read the daily weather records for San Diego International Airport.
# DATE is deliberately read as plain text here; it is converted to a Date
# in a separate step further down.
sandiego_airport <- read_csv(
  file = "2933363.csv",
  col_types = cols(DATE = col_character())
)
# Quick structural check: row/column counts and the type of each column.
glimpse(sandiego_airport)
## Rows: 29,543
## Columns: 6
## $ STATION <chr> "USW00023188", "USW00023188", "USW00023188", "USW00023188", "U~
## $ NAME <chr> "SAN DIEGO INTERNATIONAL AIRPORT, CA US", "SAN DIEGO INTERNATI~
## $ DATE <chr> "1941-05-13", "1941-05-14", "1941-05-15", "1941-05-16", "1941-~
## $ PRCP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ TMAX <dbl> 74, 73, 74, 74, 70, 68, 81, 84, 85, 75, 75, 74, 86, 80, 75, 73~
## $ TMIN <dbl> 59, 60, 58, 61, 60, 59, 54, 56, 58, 60, 63, 61, 60, 60, 58, 62~
# Turn the text DATE column into a proper Date so it can be summarised and
# split into year / month / day components.
sandiego_airport <- sandiego_airport %>%
  mutate(DATE = as.Date(DATE))
glimpse(sandiego_airport)
## Rows: 29,543
## Columns: 6
## $ STATION <chr> "USW00023188", "USW00023188", "USW00023188", "USW00023188", "U~
## $ NAME <chr> "SAN DIEGO INTERNATIONAL AIRPORT, CA US", "SAN DIEGO INTERNATI~
## $ DATE <date> 1941-05-13, 1941-05-14, 1941-05-15, 1941-05-16, 1941-05-17, 1~
## $ PRCP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ TMAX <dbl> 74, 73, 74, 74, 70, 68, 81, 84, 85, 75, 75, 74, 86, 80, 75, 73~
## $ TMIN <dbl> 59, 60, 58, 61, 60, 59, 54, 56, 58, 60, 63, 61, 60, 60, 58, 62~
# Per-column summary of the raw data; the NA's rows of the output show how
# many values are missing in each measurement column.
sandiego_airport %>%
  summary()
## STATION NAME DATE PRCP
## Length:29543 Length:29543 Min. :1941-05-13 Min. :0.00000
## Class :character Class :character 1st Qu.:1961-08-04 1st Qu.:0.00000
## Mode :character Mode :character Median :1981-10-24 Median :0.00000
## Mean :1981-10-23 Mean :0.02641
## 3rd Qu.:2002-01-12 3rd Qu.:0.00000
## Max. :2022-04-03 Max. :2.70000
## NA's :2
## TMAX TMIN
## Min. : 46.00 Min. :29.00
## 1st Qu.: 66.00 1st Qu.:52.00
## Median : 70.00 Median :58.00
## Mean : 70.69 Mean :57.27
## 3rd Qu.: 75.00 3rd Qu.:63.00
## Max. :111.00 Max. :78.00
## NA's :3 NA's :3
# List every row in which at least one of the three measurements is missing.
sandiego_airport %>%
  filter(if_any(c(TMAX, TMIN, PRCP), is.na))
## # A tibble: 4 x 6
## STATION NAME DATE PRCP TMAX TMIN
## <chr> <chr> <date> <dbl> <dbl> <dbl>
## 1 USW00023188 SAN DIEGO INTERNATIONAL AIRPORT, CA ~ 1946-02-23 0 64 NA
## 2 USW00023188 SAN DIEGO INTERNATIONAL AIRPORT, CA ~ 1946-07-17 0 NA 65
## 3 USW00023188 SAN DIEGO INTERNATIONAL AIRPORT, CA ~ 2022-03-24 NA NA NA
## 4 USW00023188 SAN DIEGO INTERNATIONAL AIRPORT, CA ~ 2022-04-03 NA NA NA
# Four rows have missing values: one missing TMIN and one missing TMAX in 1946,
# and two days in 2022 on which PRCP, TMAX and TMIN are all missing.
# Discard every row that has an NA in any column, then re-run the summary to
# confirm that no missing values remain.
sandiego_airport <- drop_na(sandiego_airport)
sandiego_airport %>%
  summary()
## STATION NAME DATE PRCP
## Length:29539 Length:29539 Min. :1941-05-13 Min. :0.00000
## Class :character Class :character 1st Qu.:1961-08-05 1st Qu.:0.00000
## Mode :character Mode :character Median :1981-10-24 Median :0.00000
## Mean :1981-10-23 Mean :0.02642
## 3rd Qu.:2002-01-11 3rd Qu.:0.00000
## Max. :2022-04-02 Max. :2.70000
## TMAX TMIN
## Min. : 46.00 Min. :29.00
## 1st Qu.: 66.00 1st Qu.:52.00
## Median : 70.00 Median :58.00
## Mean : 70.69 Mean :57.27
## 3rd Qu.: 75.00 3rd Qu.:63.00
## Max. :111.00 Max. :78.00
#We will now get the density with rug plots for TMAX and TMIN
# Density curve of TMAX, with a rug along the axis marking the observed values.
ggplot(data = sandiego_airport, mapping = aes(x = TMAX)) +
  geom_density() +
  geom_rug() +
  labs(title = "TMAX")
# Looking at the graph, we can see the density decrease, rise slightly again, and then fall off completely.
# Density curve of TMIN, with a rug along the axis marking the observed values.
ggplot(data = sandiego_airport, mapping = aes(x = TMIN)) +
  geom_density() +
  geom_rug() +
  labs(title = "TMIN")
# Here we can see that there is a primary peak with weak shoulders to the left and right of it.
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Derive the calendar components of DATE and store them as factors so they can
# serve as discrete grouping / axis variables later on.
sandiego_airport <- mutate(
  sandiego_airport,
  yr = factor(year(DATE)),
  mo = factor(month(DATE)),
  dy = factor(day(DATE))
)
glimpse(sandiego_airport)
## Rows: 29,539
## Columns: 9
## $ STATION <chr> "USW00023188", "USW00023188", "USW00023188", "USW00023188", "U~
## $ NAME <chr> "SAN DIEGO INTERNATIONAL AIRPORT, CA US", "SAN DIEGO INTERNATI~
## $ DATE <date> 1941-05-13, 1941-05-14, 1941-05-15, 1941-05-16, 1941-05-17, 1~
## $ PRCP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ TMAX <dbl> 74, 73, 74, 74, 70, 68, 81, 84, 85, 75, 75, 74, 86, 80, 75, 73~
## $ TMIN <dbl> 59, 60, 58, 61, 60, 59, 54, 56, 58, 60, 63, 61, 60, 60, 58, 62~
## $ yr <fct> 1941, 1941, 1941, 1941, 1941, 1941, 1941, 1941, 1941, 1941, 19~
## $ mo <fct> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,~
## $ dy <fct> 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28~
# Summary of the cleaned data, now including level counts for yr, mo and dy.
sandiego_airport %>%
  summary()
## STATION NAME DATE PRCP
## Length:29539 Length:29539 Min. :1941-05-13 Min. :0.00000
## Class :character Class :character 1st Qu.:1961-08-05 1st Qu.:0.00000
## Mode :character Mode :character Median :1981-10-24 Median :0.00000
## Mean :1981-10-23 Mean :0.02642
## 3rd Qu.:2002-01-11 3rd Qu.:0.00000
## Max. :2022-04-02 Max. :2.70000
##
## TMAX TMIN yr mo
## Min. : 46.00 Min. :29.00 1944 : 366 1 : 2511
## 1st Qu.: 66.00 1st Qu.:52.00 1948 : 366 8 : 2511
## Median : 70.00 Median :58.00 1952 : 366 10 : 2511
## Mean : 70.69 Mean :57.27 1956 : 366 12 : 2511
## 3rd Qu.: 75.00 3rd Qu.:63.00 1960 : 366 3 : 2510
## Max. :111.00 Max. :78.00 1964 : 366 7 : 2510
## (Other):27343 (Other):14475
## dy
## 1 : 971
## 2 : 971
## 13 : 971
## 14 : 971
## 15 : 971
## 16 : 971
## (Other):23713
# Write the cleaned data frame (including the yr / mo / dy factors) to disk.
save(list = "sandiego_airport", file = "sandiego_airport.Rdata")
# Total precipitation for May through August of each year, standardised
# against the across-year mean and standard deviation of those totals.
rain_5_8 <- sandiego_airport %>%
  # Keep only what is needed; yr and mo are recomputed as plain numbers here
  # (the factor versions on sandiego_airport are not used).
  transmute(
    yr = year(DATE),
    mo = month(DATE),
    PRCP
  ) %>%
  filter(mo %in% 5:8) %>%
  group_by(yr) %>%
  summarise(rain = sum(PRCP)) %>%
  ungroup() %>%
  mutate(
    mrain = mean(rain),
    sdrain = sd(rain),
    z_score = (rain - mrain) / sdrain
  )
head(rain_5_8)
## # A tibble: 6 x 5
## yr rain mrain sdrain z_score
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1941 0.09 0.372 0.672 -0.419
## 2 1942 0.12 0.372 0.672 -0.375
## 3 1943 0.03 0.372 0.672 -0.509
## 4 1944 0.32 0.372 0.672 -0.0770
## 5 1945 1.06 0.372 0.672 1.02
## 6 1946 0.01 0.372 0.672 -0.538
# Yearly z-scores over time: one point per year, joined by a thin line.
ggplot(data = rain_5_8, mapping = aes(x = yr, y = z_score)) +
  geom_point() +
  geom_line(size = 0.1)
# Column-wise summary of the yearly totals and their z-scores.
rain_5_8 %>%
  summary()
## yr rain mrain sdrain
## Min. :1941 Min. :0.0000 Min. :0.3717 Min. :0.6719
## 1st Qu.:1961 1st Qu.:0.0400 1st Qu.:0.3717 1st Qu.:0.6719
## Median :1981 Median :0.1700 Median :0.3717 Median :0.6719
## Mean :1981 Mean :0.3717 Mean :0.3717 Mean :0.6719
## 3rd Qu.:2001 3rd Qu.:0.4000 3rd Qu.:0.3717 3rd Qu.:0.6719
## Max. :2021 Max. :4.1500 Max. :0.3717 Max. :0.6719
## z_score
## Min. :-0.55325
## 1st Qu.:-0.49372
## Median :-0.30024
## Mean : 0.00000
## 3rd Qu.: 0.04208
## Max. : 5.62328
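# The lowest z-score is -0.55325, which corresponds to a May-August total of
# zero rain (rain = 0). It does not belong to 1941: summary() reports each
# column's minimum independently, and 1941 has a z-score of -0.419.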
# Distribution of the yearly May-August rainfall totals (default binning).
ggplot(data = rain_5_8, mapping = aes(x = rain)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Compared to Olympia International Airport, less rain occurred at San Diego Airport.
# "Perfect" days are those with TMAX exactly 75 and no precipitation.
# Count them per year, then chart how often each yearly count occurs.
sandiego_airport %>%
  filter(TMAX == 75, PRCP == 0) %>%
  count(yr, name = "count") %>%
  filter(count > 0) %>%
  ggplot(mapping = aes(x = factor(count))) +
  geom_bar()
# The most common yearly count is 18, i.e. San Diego most often had 18 perfect days in a single year.
# The same "perfect" days (TMAX exactly 75, no precipitation), tallied by
# calendar month.
ggplot(
  data = filter(sandiego_airport, TMAX == 75, PRCP == 0),
  mapping = aes(x = mo)
) +
  geom_bar()
# July had the highest count of perfect days, with August close behind.