Build OAW23

Harold Nelson

2023-02-10

Setup

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Import the Data

Solution

# Read the downloaded CSV export into a tibble; column types are guessed
# by readr.
OAW23 <- read_csv(file = "~/Downloads/3229019.csv")
## Rows: 29853 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): STATION, NAME
## dbl  (3): PRCP, TMAX, TMIN
## date (1): DATE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Examine the Data

Solution

# Per-column summary, including NA counts for the numeric columns.
OAW23 %>% summary()
##    STATION              NAME                DATE                 PRCP       
##  Length:29853       Length:29853       Min.   :1941-05-13   Min.   :0.0000  
##  Class :character   Class :character   1st Qu.:1961-10-18   1st Qu.:0.0000  
##  Mode  :character   Mode  :character   Median :1982-03-25   Median :0.0000  
##                                        Mean   :1982-03-25   Mean   :0.1367  
##                                        3rd Qu.:2002-08-30   3rd Qu.:0.1400  
##                                        Max.   :2023-02-04   Max.   :4.8200  
##                                                             NA's   :3       
##       TMAX             TMIN      
##  Min.   : 18.00   Min.   :-8.00  
##  1st Qu.: 50.00   1st Qu.:33.00  
##  Median : 59.00   Median :40.00  
##  Mean   : 60.56   Mean   :39.83  
##  3rd Qu.: 71.00   3rd Qu.:47.00  
##  Max.   :110.00   Max.   :69.00  
##  NA's   :11       NA's   :11
# Compact column-by-column view: name, type and the first few values.
OAW23 %>% glimpse()
## Rows: 29,853
## Columns: 6
## $ STATION <chr> "USW00024227", "USW00024227", "USW00024227", "USW00024227", "U…
## $ NAME    <chr> "OLYMPIA AIRPORT, WA US", "OLYMPIA AIRPORT, WA US", "OLYMPIA A…
## $ DATE    <date> 1941-05-13, 1941-05-14, 1941-05-15, 1941-05-16, 1941-05-17, 1…
## $ PRCP    <dbl> 0.00, 0.00, 0.30, 1.08, 0.06, 0.00, 0.00, 0.00, 0.00, 0.00, 0.…
## $ TMAX    <dbl> 66, 63, 58, 55, 57, 59, 58, 65, 68, 85, 84, 75, 72, 59, 61, 59…
## $ TMIN    <dbl> 50, 47, 44, 45, 46, 39, 40, 50, 42, 46, 46, 50, 41, 37, 48, 46…

When NA?

Solution

# Show every row in which at least one of the three measurements is missing.
OAW23 %>%
  filter(if_any(c(PRCP, TMAX, TMIN), is.na))
## # A tibble: 14 × 6
##    STATION     NAME                   DATE        PRCP  TMAX  TMIN
##    <chr>       <chr>                  <date>     <dbl> <dbl> <dbl>
##  1 USW00024227 OLYMPIA AIRPORT, WA US 1996-01-24 NA       39    33
##  2 USW00024227 OLYMPIA AIRPORT, WA US 1996-07-03  0.12    67    NA
##  3 USW00024227 OLYMPIA AIRPORT, WA US 1996-12-26 NA       NA    NA
##  4 USW00024227 OLYMPIA AIRPORT, WA US 1996-12-27 NA       NA    NA
##  5 USW00024227 OLYMPIA AIRPORT, WA US 1997-04-06  0       NA    28
##  6 USW00024227 OLYMPIA AIRPORT, WA US 1997-04-07  0       61    NA
##  7 USW00024227 OLYMPIA AIRPORT, WA US 1997-04-12  0       NA    28
##  8 USW00024227 OLYMPIA AIRPORT, WA US 1997-04-13  0.39    NA    NA
##  9 USW00024227 OLYMPIA AIRPORT, WA US 1997-04-14  0.35    NA    NA
## 10 USW00024227 OLYMPIA AIRPORT, WA US 1997-05-07  0       NA    NA
## 11 USW00024227 OLYMPIA AIRPORT, WA US 1997-05-08  0       NA    NA
## 12 USW00024227 OLYMPIA AIRPORT, WA US 1997-05-09  0       NA    NA
## 13 USW00024227 OLYMPIA AIRPORT, WA US 1997-05-13  0       NA    NA
## 14 USW00024227 OLYMPIA AIRPORT, WA US 1997-05-14  0       NA    NA

Get Rid of NA Values

Solution

# Remove every row that has a missing value in any column, then re-run the
# summary to confirm that no NA counts remain.
OAW23 <- OAW23 %>%
  drop_na()

summary(OAW23)
##    STATION              NAME                DATE                 PRCP       
##  Length:29839       Length:29839       Min.   :1941-05-13   Min.   :0.0000  
##  Class :character   Class :character   1st Qu.:1961-10-14   1st Qu.:0.0000  
##  Mode  :character   Mode  :character   Median :1982-03-18   Median :0.0000  
##                                        Mean   :1982-03-22   Mean   :0.1367  
##                                        3rd Qu.:2002-09-02   3rd Qu.:0.1400  
##                                        Max.   :2023-02-04   Max.   :4.8200  
##       TMAX             TMIN      
##  Min.   : 18.00   Min.   :-8.00  
##  1st Qu.: 50.00   1st Qu.:33.00  
##  Median : 59.00   Median :40.00  
##  Mean   : 60.56   Mean   :39.83  
##  3rd Qu.: 71.00   3rd Qu.:47.00  
##  Max.   :110.00   Max.   :69.00

Drop Name and Station

Solution

# Keep only the date and the three measurement columns; STATION and NAME
# are dropped from the data frame.
OAW23 <- OAW23 %>%
  select(DATE, PRCP, TMAX, TMIN)

summary(OAW23)
##       DATE                 PRCP             TMAX             TMIN      
##  Min.   :1941-05-13   Min.   :0.0000   Min.   : 18.00   Min.   :-8.00  
##  1st Qu.:1961-10-14   1st Qu.:0.0000   1st Qu.: 50.00   1st Qu.:33.00  
##  Median :1982-03-18   Median :0.0000   Median : 59.00   Median :40.00  
##  Mean   :1982-03-22   Mean   :0.1367   Mean   : 60.56   Mean   :39.83  
##  3rd Qu.:2002-09-02   3rd Qu.:0.1400   3rd Qu.: 71.00   3rd Qu.:47.00  
##  Max.   :2023-02-04   Max.   :4.8200   Max.   :110.00   Max.   :69.00

Add Date Info

Do a little research on the lubridate functions year(), month(), and day(). Use them to add yr, mo, and dy to the dataframe.

Solution

# Derive calendar components from DATE with lubridate.
# mo is converted to a factor in the same step, so month is stored as a
# categorical variable rather than a number.
OAW23 <- OAW23 %>%
  mutate(
    yr = year(DATE),
    mo = factor(month(DATE)),
    dy = day(DATE)
  )

summary(OAW23)
##       DATE                 PRCP             TMAX             TMIN      
##  Min.   :1941-05-13   Min.   :0.0000   Min.   : 18.00   Min.   :-8.00  
##  1st Qu.:1961-10-14   1st Qu.:0.0000   1st Qu.: 50.00   1st Qu.:33.00  
##  Median :1982-03-18   Median :0.0000   Median : 59.00   Median :40.00  
##  Mean   :1982-03-22   Mean   :0.1367   Mean   : 60.56   Mean   :39.83  
##  3rd Qu.:2002-09-02   3rd Qu.:0.1400   3rd Qu.: 71.00   3rd Qu.:47.00  
##  Max.   :2023-02-04   Max.   :4.8200   Max.   :110.00   Max.   :69.00  
##                                                                        
##        yr             mo              dy       
##  Min.   :1941   8      : 2542   Min.   : 1.00  
##  1st Qu.:1961   10     : 2542   1st Qu.: 8.00  
##  Median :1982   1      : 2541   Median :16.00  
##  Mean   :1982   7      : 2541   Mean   :15.73  
##  3rd Qu.:2002   12     : 2540   3rd Qu.:23.00  
##  Max.   :2023   5      : 2525   Max.   :31.00  
##                 (Other):14608
# Print the first rows to check the new yr, mo and dy columns.
OAW23 %>% head()
## # A tibble: 6 × 7
##   DATE        PRCP  TMAX  TMIN    yr mo       dy
##   <date>     <dbl> <dbl> <dbl> <dbl> <fct> <int>
## 1 1941-05-13  0       66    50  1941 5        13
## 2 1941-05-14  0       63    47  1941 5        14
## 3 1941-05-15  0.3     58    44  1941 5        15
## 4 1941-05-16  1.08    55    45  1941 5        16
## 5 1941-05-17  0.06    57    46  1941 5        17
## 6 1941-05-18  0       59    39  1941 5        18

Save the Dataframe

Solution

# save(OAW23,file = "OAW23.Rdata")
# Commented out so that the file is not written while knitting