Load Libraries

library(tidyverse)
## ── Attaching packages ──────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(readr)

Get Stations File

This txt file was downloaded from https://www.ncdc.noaa.gov/ghcnd-data-access.

The file is in a fortran style format. The code to read it was taken from http://spatialreasoning.com/wp/20170307_1244_r-reading-filtering-weather-data-from-the-global-historical-climatology-network-ghcn.

The following code reads the txt ghcnd_stations.txt file and creates the R dataframe stations.

typedcols <- c( "A11", "F9", "F10", "F7", "X1","A2",
                  "X1","A30", "X1", "A3", "X1", "A3", "X1", "A5" )
stations <- read.fortran("ghcnd-stations.txt",
                             typedcols, 
                             comment.char="")
hdrs <- c("ID", "LAT", "LON", "ELEV", "ST", "NAME","GSN", "HCN", "WMOID")
names(stations) <- hdrs

Get Inventory

The file ghcnd-inventory.txt was downloaded from the same location as the stations file.

After reading the file, the following code selects only the rows which indicate the presence of the daily maximum temperature, TMAX. The file contains a row for every combination of station, time period, and weather measure. The code keeps only one row per station and time period. It selects only the rows with TMAX since we have no interest in observations not containing TMAX.

Also the code keeps only records for stations remaining active in 2019 or later.

# Generated with the import button
inventory <- read_table2("ghcnd-inventory.txt", col_names = FALSE)
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_double(),
##   X3 = col_double(),
##   X4 = col_character(),
##   X5 = col_double(),
##   X6 = col_double()
## )
# Create column names 
colnames(inventory) = c("ID", "LAT", "LON","MEASURE","START_YR","END_YR")

# dplyr code to 
inventory %>% filter(MEASURE == "TMAX",END_YR >= 2019) %>% 
  select(ID, START_YR,END_YR) %>% 
  mutate(YR_LEN = END_YR - START_YR) -> inventory

head(inventory)
## # A tibble: 6 x 4
##   ID          START_YR END_YR YR_LEN
##   <chr>          <dbl>  <dbl>  <dbl>
## 1 AE000041196     1944   2019     75
## 2 AEM00041194     1983   2019     36
## 3 AEM00041217     1983   2019     36
## 4 AEM00041218     1994   2019     25
## 5 AFM00040938     1973   2019     46
## 6 AFM00040948     1966   2019     53

Resources

The next step is to use the inventory file as the basic data source and join the other useful data from the stations file. The first step is to drop the data we don’t from the stations file.

stations %>% select(ID, LAT, LON, ELEV, ST, NAME) -> stations

Now join the two files, keeping all records from the inventory and augmenting them with the stations data. The join is based on the variable ID, which is present in both files. For convenience, convert the character variable ST to a factor.

inventory %>% left_join(stations) %>% 
  mutate(ST = factor(ST)) -> resources
## Joining, by = "ID"
glimpse(resources)
## Observations: 13,825
## Variables: 9
## $ ID       <chr> "AE000041196", "AEM00041194", "AEM00041217", "AEM000412…
## $ START_YR <dbl> 1944, 1983, 1983, 1994, 1973, 1966, 1973, 1940, 1892, 1…
## $ END_YR   <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2…
## $ YR_LEN   <dbl> 75, 36, 36, 25, 46, 53, 46, 79, 127, 61, 133, 141, 139,…
## $ LAT      <dbl> 25.3330, 25.2550, 24.4330, 24.2620, 34.2100, 34.5660, 3…
## $ LON      <dbl> 55.5170, 55.3640, 54.6510, 55.6090, 62.2280, 69.2120, 6…
## $ ELEV     <dbl> 34.0, 10.4, 26.8, 264.9, 977.2, 1791.3, 1010.0, 24.0, 3…
## $ ST       <fct>   ,   ,   ,   ,   ,   ,   ,   ,   ,   ,   ,   ,   ,   ,…
## $ NAME     <chr> "SHARJAH INTER. AIRP           ", "DUBAI INTL          …
save(resources,file="resources.rdata")

Use Resources

Suppose we are interested in stations in the state of Washington. We can use the variable ST to select them.

wa_stations = resources %>% 
  filter(ST=="WA")
glimpse(wa_stations)
## Observations: 231
## Variables: 9
## $ ID       <chr> "USC00450008", "USC00450257", "USC00450456", "USC004504…
## $ START_YR <dbl> 1891, 1923, 1970, 1929, 1985, 1893, 1965, 1899, 1977, 1…
## $ END_YR   <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2…
## $ YR_LEN   <dbl> 128, 96, 49, 90, 34, 126, 54, 120, 42, 110, 126, 129, 9…
## $ LAT      <dbl> 46.9658, 48.2006, 47.7722, 45.7717, 48.7178, 48.9775, 4…
## $ LON      <dbl> -123.8292, -122.1281, -121.4819, -122.5286, -122.5114, …
## $ ELEV     <dbl> 3.0, 30.5, 234.7, 86.6, 4.6, 18.3, 559.9, 33.5, 345.6, …
## $ ST       <fct> WA, WA, WA, WA, WA, WA, WA, WA, WA, WA, WA, WA, WA, WA,…
## $ NAME     <chr> "ABERDEEN                      ", "ARLINGTON           …

After using View() in the console and sorting the data by name, we can see that the ID for the Olympia airport is USW00024227.

Station Reader

Here is a useful function to read the station data for a given value of ID.

read_station = function(ID){
  url = paste0("https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/",
                      ID,
                      ".csv")
  df = read_csv(url)
  return(df) 
}

# Test this to get the data for the Olympia airport
# First all of the data
OAP_Raw1 = read_station("USW00024227")
## Parsed with column specification:
## cols(
##   .default = col_logical(),
##   STATION = col_character(),
##   DATE = col_date(format = ""),
##   LATITUDE = col_double(),
##   LONGITUDE = col_double(),
##   ELEVATION = col_double(),
##   NAME = col_character(),
##   PRCP = col_double(),
##   PRCP_ATTRIBUTES = col_character(),
##   SNOW = col_double(),
##   SNOW_ATTRIBUTES = col_character(),
##   SNWD = col_double(),
##   SNWD_ATTRIBUTES = col_character(),
##   TMAX = col_double(),
##   TMAX_ATTRIBUTES = col_character(),
##   TMIN = col_double(),
##   TMIN_ATTRIBUTES = col_character()
## )
## See spec(...) for full column specifications.
## Warning: 318051 parsing failures.
##  row             col           expected actual                                                                                                file
## 2425 WT01_ATTRIBUTES 1/0/T/F/TRUE/FALSE    ,,X 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/USW00024227.csv'
## 2425 WT16_ATTRIBUTES 1/0/T/F/TRUE/FALSE    ,,X 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/USW00024227.csv'
## 2426 WT16_ATTRIBUTES 1/0/T/F/TRUE/FALSE    ,,X 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/USW00024227.csv'
## 2427 WT01_ATTRIBUTES 1/0/T/F/TRUE/FALSE    ,,X 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/USW00024227.csv'
## 2427 WT16_ATTRIBUTES 1/0/T/F/TRUE/FALSE    ,,X 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/USW00024227.csv'
## .... ............... .................. ...... ...................................................................................................
## See problems(...) for more details.
# Keep just the variables we want
OAP_Raw2 = OAP_Raw1 %>% select(STATION,DATE,NAME,PRCP,SNOW,TMAX,TMIN)

# Look at the data and think about units of measurement
summary(OAP_Raw2)
##    STATION               DATE                NAME          
##  Length:28751       Min.   :1941-05-13   Length:28751      
##  Class :character   1st Qu.:1961-01-15   Class :character  
##  Mode  :character   Median :1980-09-20   Mode  :character  
##                     Mean   :1980-09-20                     
##                     3rd Qu.:2000-05-25                     
##                     Max.   :2020-01-29                     
##                                                            
##       PRCP              SNOW              TMAX            TMIN        
##  Min.   :   0.00   Min.   :  0.000   Min.   :-78.0   Min.   :-222.00  
##  1st Qu.:   0.00   1st Qu.:  0.000   1st Qu.:100.0   1st Qu.:   6.00  
##  Median :   0.00   Median :  0.000   Median :150.0   Median :  44.00  
##  Mean   :  34.71   Mean   :  1.012   Mean   :158.5   Mean   :  43.36  
##  3rd Qu.:  36.00   3rd Qu.:  0.000   3rd Qu.:217.0   3rd Qu.:  83.00  
##  Max.   :1224.00   Max.   :361.000   Max.   :400.0   Max.   : 206.00  
##  NA's   :4         NA's   :5416      NA's   :12      NA's   :12