Build SD2310

Harold Nelson

2023-10-17

Setup

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1     ✔ purrr   1.0.1
## ✔ tibble  3.2.1     ✔ dplyr   1.1.1
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Import the Data

Solution

 # Import the downloaded CSV. DATE is explicitly read as character; the
 # remaining column types are left for read_csv() to guess.
 SD2310 <- read_csv(
   "~/Downloads/3487998.csv",
   col_types = cols(DATE = col_character())
 )

Examine the Data

Solution

# Per-column summary of the freshly imported data (also reports NA counts).
SD2310 %>%
  summary()
##    STATION              NAME               DATE                PRCP        
##  Length:30783       Length:30783       Length:30783       Min.   :0.00000  
##  Class :character   Class :character   Class :character   1st Qu.:0.00000  
##  Mode  :character   Mode  :character   Mode  :character   Median :0.00000  
##                                                           Mean   :0.02709  
##                                                           3rd Qu.:0.00000  
##                                                           Max.   :2.70000  
##                                                                            
##       TMAX             TMIN      
##  Min.   : 46.00   Min.   :29.00  
##  1st Qu.: 66.00   1st Qu.:52.00  
##  Median : 70.00   Median :58.00  
##  Mean   : 70.72   Mean   :57.27  
##  3rd Qu.: 75.00   3rd Qu.:63.00  
##  Max.   :111.00   Max.   :78.00  
##  NA's   :1        NA's   :1
# Compact overview: row and column counts, plus each column's type and
# its first few values.
glimpse(SD2310)
## Rows: 30,783
## Columns: 6
## $ STATION <chr> "USW00023188", "USW00023188", "USW00023188", "USW00023188", "U…
## $ NAME    <chr> "SAN DIEGO INTERNATIONAL AIRPORT, CA US", "SAN DIEGO INTERNATI…
## $ DATE    <chr> "1939-07-01", "1939-07-02", "1939-07-03", "1939-07-04", "1939-…
## $ PRCP    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ TMAX    <dbl> 76, 74, 71, 71, 72, 72, 75, 75, 76, 76, 80, 79, 73, 74, 71, 70…
## $ TMIN    <dbl> 63, 65, 62, 63, 64, 65, 61, 63, 65, 65, 63, 64, 63, 62, 60, 59…

When Do the NA Values Occur?

Solution

# Show the rows in which at least one of the measurement columns
# (PRCP, TMAX, TMIN) is missing.
filter(SD2310, if_any(c(PRCP, TMAX, TMIN), is.na))
## # A tibble: 2 × 6
##   STATION     NAME                                   DATE       PRCP  TMAX  TMIN
##   <chr>       <chr>                                  <chr>     <dbl> <dbl> <dbl>
## 1 USW00023188 SAN DIEGO INTERNATIONAL AIRPORT, CA US 1946-02-…     0    64    NA
## 2 USW00023188 SAN DIEGO INTERNATIONAL AIRPORT, CA US 1946-07-…     0    NA    65

Get Rid of NA Values

Solution

# Remove every row that has an NA in any column (drop_na() with no column
# arguments checks them all), then re-run summary() to confirm that no
# "NA's" entries remain.
SD2310 <- SD2310 %>%
  drop_na()

summary(SD2310)
##    STATION              NAME               DATE                PRCP        
##  Length:30781       Length:30781       Length:30781       Min.   :0.00000  
##  Class :character   Class :character   Class :character   1st Qu.:0.00000  
##  Mode  :character   Mode  :character   Mode  :character   Median :0.00000  
##                                                           Mean   :0.02709  
##                                                           3rd Qu.:0.00000  
##                                                           Max.   :2.70000  
##       TMAX             TMIN      
##  Min.   : 46.00   Min.   :29.00  
##  1st Qu.: 66.00   1st Qu.:52.00  
##  Median : 70.00   Median :58.00  
##  Mean   : 70.72   Mean   :57.27  
##  3rd Qu.: 75.00   3rd Qu.:63.00  
##  Max.   :111.00   Max.   :78.00

Drop NAME and STATION

Solution

# Keep only the date and the three measurement columns; STATION and NAME
# are dropped by not being selected.
SD2310 <- SD2310 %>%
  select(DATE, PRCP, TMAX, TMIN)

summary(SD2310)
##      DATE                PRCP              TMAX             TMIN      
##  Length:30781       Min.   :0.00000   Min.   : 46.00   Min.   :29.00  
##  Class :character   1st Qu.:0.00000   1st Qu.: 66.00   1st Qu.:52.00  
##  Mode  :character   Median :0.00000   Median : 70.00   Median :58.00  
##                     Mean   :0.02709   Mean   : 70.72   Mean   :57.27  
##                     3rd Qu.:0.00000   3rd Qu.: 75.00   3rd Qu.:63.00  
##                     Max.   :2.70000   Max.   :111.00   Max.   :78.00

Add Date Info

Solution

# Derive calendar fields from the character DATE column using lubridate:
#   yr - year
#   mo - month, stored as a factor
#   dy - day of the month
SD2310 <- SD2310 %>%
  mutate(
    yr = year(DATE),
    mo = factor(month(DATE)),
    dy = day(DATE)
  )

summary(SD2310)
##      DATE                PRCP              TMAX             TMIN      
##  Length:30781       Min.   :0.00000   Min.   : 46.00   Min.   :29.00  
##  Class :character   1st Qu.:0.00000   1st Qu.: 66.00   1st Qu.:52.00  
##  Mode  :character   Median :0.00000   Median : 70.00   Median :58.00  
##                     Mean   :0.02709   Mean   : 70.72   Mean   :57.27  
##                     3rd Qu.:0.00000   3rd Qu.: 75.00   3rd Qu.:63.00  
##                     Max.   :2.70000   Max.   :111.00   Max.   :78.00  
##                                                                       
##        yr             mo              dy       
##  Min.   :1939   8      : 2635   Min.   : 1.00  
##  1st Qu.:1960   7      : 2634   1st Qu.: 8.00  
##  Median :1981   10     : 2617   Median :16.00  
##  Mean   :1981   1      : 2604   Mean   :15.73  
##  3rd Qu.:2002   3      : 2604   3rd Qu.:23.00  
##  Max.   :2023   5      : 2604   Max.   :31.00  
##                 (Other):15083
# Print the first rows to check the newly added yr, mo and dy columns.
SD2310 %>%
  head()
## # A tibble: 6 × 7
##   DATE        PRCP  TMAX  TMIN    yr mo       dy
##   <chr>      <dbl> <dbl> <dbl> <dbl> <fct> <int>
## 1 1939-07-01     0    76    63  1939 7         1
## 2 1939-07-02     0    74    65  1939 7         2
## 3 1939-07-03     0    71    62  1939 7         3
## 4 1939-07-04     0    71    63  1939 7         4
## 5 1939-07-05     0    72    64  1939 7         5
## 6 1939-07-06     0    72    65  1939 7         6

Save the file

# save(SD2310,file = "SD2310.Rdata")

Save the Dataframe

Solution

# save(SD2310,file = "SD2310.Rdata")
# Left commented out so the file is not rewritten every time the document is knitted.