Libraries

library(tidyverse)
## ── Attaching packages ───────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ──────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
read_station = function(ID){
  url = paste0("https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/",
                      ID,
                      ".csv")
  df = read_csv(url)
  return(df) 
}

# Test this to get the data for the Olympia airport
# First all of the data
OAP_Raw1 = read_station("USW00024227")
## Parsed with column specification:
## cols(
##   .default = col_logical(),
##   STATION = col_character(),
##   DATE = col_date(format = ""),
##   LATITUDE = col_double(),
##   LONGITUDE = col_double(),
##   ELEVATION = col_double(),
##   NAME = col_character(),
##   PRCP = col_double(),
##   PRCP_ATTRIBUTES = col_character(),
##   SNOW = col_double(),
##   SNOW_ATTRIBUTES = col_character(),
##   SNWD = col_double(),
##   SNWD_ATTRIBUTES = col_character(),
##   TMAX = col_double(),
##   TMAX_ATTRIBUTES = col_character(),
##   TMIN = col_double(),
##   TMIN_ATTRIBUTES = col_character()
## )
## See spec(...) for full column specifications.
## Warning: 318130 parsing failures.
##  row             col           expected actual                                                                                                file
## 2425 WT01_ATTRIBUTES 1/0/T/F/TRUE/FALSE    ,,X 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/USW00024227.csv'
## 2425 WT16_ATTRIBUTES 1/0/T/F/TRUE/FALSE    ,,X 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/USW00024227.csv'
## 2426 WT16_ATTRIBUTES 1/0/T/F/TRUE/FALSE    ,,X 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/USW00024227.csv'
## 2427 WT01_ATTRIBUTES 1/0/T/F/TRUE/FALSE    ,,X 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/USW00024227.csv'
## 2427 WT16_ATTRIBUTES 1/0/T/F/TRUE/FALSE    ,,X 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/USW00024227.csv'
## .... ............... .................. ...... ...................................................................................................
## See problems(...) for more details.
# Keep just the variables we want
OAP_Raw2 = OAP_Raw1 %>% select(STATION,DATE,NAME,PRCP,SNOW,TMAX,TMIN)

# Look at the data and think about units of measurement
summary(OAP_Raw2)
##    STATION               DATE                NAME                PRCP        
##  Length:28756       Min.   :1941-05-13   Length:28756       Min.   :   0.00  
##  Class :character   1st Qu.:1961-01-16   Class :character   1st Qu.:   0.00  
##  Mode  :character   Median :1980-09-22   Mode  :character   Median :   0.00  
##                     Mean   :1980-09-22                      Mean   :  34.73  
##                     3rd Qu.:2000-05-29                      3rd Qu.:  36.00  
##                     Max.   :2020-02-03                      Max.   :1224.00  
##                                                             NA's   :4        
##       SNOW              TMAX            TMIN        
##  Min.   :  0.000   Min.   :-78.0   Min.   :-222.00  
##  1st Qu.:  0.000   1st Qu.:100.0   1st Qu.:   6.00  
##  Median :  0.000   Median :150.0   Median :  44.00  
##  Mean   :  1.012   Mean   :158.5   Mean   :  43.36  
##  3rd Qu.:  0.000   3rd Qu.:217.0   3rd Qu.:  83.00  
##  Max.   :361.000   Max.   :400.0   Max.   : 206.00  
##  NA's   :5416      NA's   :12      NA's   :12

Some observations:

  1. The temperatures are in degrees celsius multiplied by 10.

  2. The PRCP and SNOW values are in some metric units. To convert to inches we need to divide by 254.

  3. The variable DATE is a legitimate date and we can use the lubridate functions to get its components.

  4. The variable SNOW has a lot of NA values. We need to look at it carefully.

Transformations

With these observations in mind, we can get OAP_Raw3, a third version of our data.

OAP_Raw2 %>% mutate(TMAX = .1 * 1.8 * TMAX + 32,
                    TMIN = .1 * 1.8 * TMIN + 32,
                    PRCP = PRCP/254,
                    SNOW = SNOW/254,
                    yr = year(DATE),
                    mo = month(DATE),
                    dy = day(DATE)) -> OAP_Raw3
summary(OAP_Raw3)
##    STATION               DATE                NAME                PRCP       
##  Length:28756       Min.   :1941-05-13   Length:28756       Min.   :0.0000  
##  Class :character   1st Qu.:1961-01-16   Class :character   1st Qu.:0.0000  
##  Mode  :character   Median :1980-09-22   Mode  :character   Median :0.0000  
##                     Mean   :1980-09-22                      Mean   :0.1367  
##                     3rd Qu.:2000-05-29                      3rd Qu.:0.1417  
##                     Max.   :2020-02-03                      Max.   :4.8189  
##                                                             NA's   :4       
##       SNOW            TMAX             TMIN             yr      
##  Min.   :0.000   Min.   : 17.96   Min.   :-7.96   Min.   :1941  
##  1st Qu.:0.000   1st Qu.: 50.00   1st Qu.:33.08   1st Qu.:1961  
##  Median :0.000   Median : 59.00   Median :39.92   Median :1980  
##  Mean   :0.004   Mean   : 60.52   Mean   :39.80   Mean   :1980  
##  3rd Qu.:0.000   3rd Qu.: 71.06   3rd Qu.:46.94   3rd Qu.:2000  
##  Max.   :1.421   Max.   :104.00   Max.   :69.08   Max.   :2020  
##  NA's   :5416    NA's   :12       NA's   :12                    
##        mo               dy       
##  Min.   : 1.000   Min.   : 1.00  
##  1st Qu.: 4.000   1st Qu.: 8.00  
##  Median : 7.000   Median :16.00  
##  Mean   : 6.534   Mean   :15.73  
##  3rd Qu.:10.000   3rd Qu.:23.00  
##  Max.   :12.000   Max.   :31.00  
## 

Observations:

All of the meteorological variables have some missing data, but the amount of missing snow data is larger than the others. In some cases data might be missing as a sloppy alternative to zero. We need to see if this replacement would solve our problems. First, let’s look at the SNOW variable.

bad_snow = OAP_Raw3 %>% 
  select(SNOW,DATE) %>% 
  filter(is.na(SNOW))
summary(bad_snow)
##       SNOW           DATE           
##  Min.   : NA    Min.   :1996-02-02  
##  1st Qu.: NA    1st Qu.:2000-07-06  
##  Median : NA    Median :2004-08-21  
##  Mean   :NaN    Mean   :2004-07-09  
##  3rd Qu.: NA    3rd Qu.:2008-05-06  
##  Max.   : NA    Max.   :2020-02-03  
##  NA's   :5416

It looks like the bad snow data began in 1996. There is so much bad data that we should see if all of the recent snow data is bad. Go back to the complete data and check for positive snow values after 1996.

recent_snow = OAP_Raw3 %>% filter(yr > 1996 & SNOW > 0)
summary(recent_snow)
##    STATION               DATE                NAME                PRCP        
##  Length:8           Min.   :2012-01-15   Length:8           Min.   :0.09843  
##  Class :character   1st Qu.:2012-01-17   Class :character   1st Qu.:0.29331  
##  Mode  :character   Median :2012-02-15   Mode  :character   Median :0.36024  
##                     Mean   :2012-09-10                      Mean   :0.51624  
##                     3rd Qu.:2013-03-31                      3rd Qu.:0.74606  
##                     Max.   :2014-02-09                      Max.   :1.09843  
##       SNOW             TMAX            TMIN             yr      
##  Min.   :0.0315   Min.   :30.92   Min.   :17.96   Min.   :2012  
##  1st Qu.:0.0935   1st Qu.:33.48   1st Qu.:24.44   1st Qu.:2012  
##  Median :0.1594   Median :35.51   Median :30.02   Median :2012  
##  Mean   :0.2682   Mean   :36.48   Mean   :27.18   Mean   :2012  
##  3rd Qu.:0.2303   3rd Qu.:38.44   3rd Qu.:30.65   3rd Qu.:2012  
##  Max.   :1.0984   Max.   :46.04   Max.   :32.00   Max.   :2014  
##        mo               dy       
##  Min.   : 1.000   Min.   : 8.00  
##  1st Qu.: 1.000   1st Qu.:12.00  
##  Median : 1.500   Median :15.50  
##  Mean   : 2.875   Mean   :14.38  
##  3rd Qu.: 2.250   3rd Qu.:17.25  
##  Max.   :12.000   Max.   :19.00

This looks bad. We know that there has been snow in Olympia after 2014. The variable SNOW needs to be dropped.

What about the missing TMAX, TMIN, and PRCP? Look at when they happened.

bad_data = OAP_Raw3 %>% 
  filter(is.na(TMAX) | is.na(TMIN) | is.na(PRCP))
summary(bad_data)
##    STATION               DATE                NAME                PRCP        
##  Length:15          Min.   :1996-01-24   Length:15          Min.   :0.00000  
##  Class :character   1st Qu.:1997-02-15   Class :character   1st Qu.:0.00000  
##  Mode  :character   Median :1997-04-13   Mode  :character   Median :0.00000  
##                     Mean   :1998-08-26                      Mean   :0.07802  
##                     3rd Qu.:1997-05-08                      3rd Qu.:0.05906  
##                     Max.   :2020-02-03                      Max.   :0.38976  
##                                                             NA's   :4        
##       SNOW         TMAX            TMIN             yr             mo        
##  Min.   :0    Min.   :39.02   Min.   :28.04   Min.   :1996   Min.   : 1.000  
##  1st Qu.:0    1st Qu.:50.00   1st Qu.:28.04   1st Qu.:1996   1st Qu.: 4.000  
##  Median :0    Median :60.98   Median :28.04   Median :1997   Median : 5.000  
##  Mean   :0    Mean   :55.64   Mean   :29.72   Mean   :1998   Mean   : 5.267  
##  3rd Qu.:0    3rd Qu.:63.95   3rd Qu.:30.56   3rd Qu.:1997   3rd Qu.: 5.000  
##  Max.   :0    Max.   :66.92   Max.   :33.08   Max.   :2020   Max.   :12.000  
##  NA's   :13   NA's   :12      NA's   :12                                     
##        dy      
##  Min.   : 3.0  
##  1st Qu.: 7.0  
##  Median :12.0  
##  Mean   :12.4  
##  3rd Qu.:14.0  
##  Max.   :27.0  
## 
# View(bad_data) Done in console

Since the amount of missing data is fairly small, we can safely drop it. If we were doing daily time-series analysis, we could do something more complicated and replace the missing data using the remainder of the data.

Now we can produce the final data file.

OAP = OAP_Raw3 %>% 
  select(-SNOW) %>% 
  na.omit()
summary(OAP)
##    STATION               DATE                NAME                PRCP       
##  Length:28741       Min.   :1941-05-13   Length:28741       Min.   :0.0000  
##  Class :character   1st Qu.:1961-01-13   Class :character   1st Qu.:0.0000  
##  Mode  :character   Median :1980-09-15   Mode  :character   Median :0.0000  
##                     Mean   :1980-09-19                      Mean   :0.1368  
##                     3rd Qu.:2000-06-01                      3rd Qu.:0.1417  
##                     Max.   :2020-02-02                      Max.   :4.8189  
##       TMAX             TMIN             yr             mo        
##  Min.   : 17.96   Min.   :-7.96   Min.   :1941   Min.   : 1.000  
##  1st Qu.: 50.00   1st Qu.:33.08   1st Qu.:1961   1st Qu.: 4.000  
##  Median : 59.00   Median :39.92   Median :1980   Median : 7.000  
##  Mean   : 60.52   Mean   :39.81   Mean   :1980   Mean   : 6.535  
##  3rd Qu.: 71.06   3rd Qu.:46.94   3rd Qu.:2000   3rd Qu.:10.000  
##  Max.   :104.00   Max.   :69.08   Max.   :2020   Max.   :12.000  
##        dy       
##  Min.   : 1.00  
##  1st Qu.: 8.00  
##  Median :16.00  
##  Mean   :15.73  
##  3rd Qu.:23.00  
##  Max.   :31.00