library(tidyverse)
## ── Attaching packages ───────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.3
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ──────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
read_station = function(ID){
url = paste0("https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/",
ID,
".csv")
df = read_csv(url)
return(df)
}
# Test this to get the data for the Olympia airport
# First all of the data
OAP_Raw1 = read_station("USW00024227")
## Parsed with column specification:
## cols(
## .default = col_logical(),
## STATION = col_character(),
## DATE = col_date(format = ""),
## LATITUDE = col_double(),
## LONGITUDE = col_double(),
## ELEVATION = col_double(),
## NAME = col_character(),
## PRCP = col_double(),
## PRCP_ATTRIBUTES = col_character(),
## SNOW = col_double(),
## SNOW_ATTRIBUTES = col_character(),
## SNWD = col_double(),
## SNWD_ATTRIBUTES = col_character(),
## TMAX = col_double(),
## TMAX_ATTRIBUTES = col_character(),
## TMIN = col_double(),
## TMIN_ATTRIBUTES = col_character()
## )
## See spec(...) for full column specifications.
## Warning: 318130 parsing failures.
## row col expected actual file
## 2425 WT01_ATTRIBUTES 1/0/T/F/TRUE/FALSE ,,X 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/USW00024227.csv'
## 2425 WT16_ATTRIBUTES 1/0/T/F/TRUE/FALSE ,,X 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/USW00024227.csv'
## 2426 WT16_ATTRIBUTES 1/0/T/F/TRUE/FALSE ,,X 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/USW00024227.csv'
## 2427 WT01_ATTRIBUTES 1/0/T/F/TRUE/FALSE ,,X 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/USW00024227.csv'
## 2427 WT16_ATTRIBUTES 1/0/T/F/TRUE/FALSE ,,X 'https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/USW00024227.csv'
## .... ............... .................. ...... ...................................................................................................
## See problems(...) for more details.
# Keep just the variables we want
OAP_Raw2 = OAP_Raw1 %>% select(STATION,DATE,NAME,PRCP,SNOW,TMAX,TMIN)
# Look at the data and think about units of measurement
summary(OAP_Raw2)
## STATION DATE NAME PRCP
## Length:28756 Min. :1941-05-13 Length:28756 Min. : 0.00
## Class :character 1st Qu.:1961-01-16 Class :character 1st Qu.: 0.00
## Mode :character Median :1980-09-22 Mode :character Median : 0.00
## Mean :1980-09-22 Mean : 34.73
## 3rd Qu.:2000-05-29 3rd Qu.: 36.00
## Max. :2020-02-03 Max. :1224.00
## NA's :4
## SNOW TMAX TMIN
## Min. : 0.000 Min. :-78.0 Min. :-222.00
## 1st Qu.: 0.000 1st Qu.:100.0 1st Qu.: 6.00
## Median : 0.000 Median :150.0 Median : 44.00
## Mean : 1.012 Mean :158.5 Mean : 43.36
## 3rd Qu.: 0.000 3rd Qu.:217.0 3rd Qu.: 83.00
## Max. :361.000 Max. :400.0 Max. : 206.00
## NA's :5416 NA's :12 NA's :12
Some observations:
The temperatures are in degrees celsius multiplied by 10.
The PRCP and SNOW values are in some metric units. To convert to inches we need to divide by 254.
The variable DATE is a legitimate date and we can use the lubridate functions to get its components.
The variable SNOW has a lot of NA values. We need to look at it carefully.
With these observations in mind, we can get OAP_Raw3, a third version of our data.
OAP_Raw2 %>% mutate(TMAX = .1 * 1.8 * TMAX + 32,
TMIN = .1 * 1.8 * TMIN + 32,
PRCP = PRCP/254,
SNOW = SNOW/254,
yr = year(DATE),
mo = month(DATE),
dy = day(DATE)) -> OAP_Raw3
summary(OAP_Raw3)
## STATION DATE NAME PRCP
## Length:28756 Min. :1941-05-13 Length:28756 Min. :0.0000
## Class :character 1st Qu.:1961-01-16 Class :character 1st Qu.:0.0000
## Mode :character Median :1980-09-22 Mode :character Median :0.0000
## Mean :1980-09-22 Mean :0.1367
## 3rd Qu.:2000-05-29 3rd Qu.:0.1417
## Max. :2020-02-03 Max. :4.8189
## NA's :4
## SNOW TMAX TMIN yr
## Min. :0.000 Min. : 17.96 Min. :-7.96 Min. :1941
## 1st Qu.:0.000 1st Qu.: 50.00 1st Qu.:33.08 1st Qu.:1961
## Median :0.000 Median : 59.00 Median :39.92 Median :1980
## Mean :0.004 Mean : 60.52 Mean :39.80 Mean :1980
## 3rd Qu.:0.000 3rd Qu.: 71.06 3rd Qu.:46.94 3rd Qu.:2000
## Max. :1.421 Max. :104.00 Max. :69.08 Max. :2020
## NA's :5416 NA's :12 NA's :12
## mo dy
## Min. : 1.000 Min. : 1.00
## 1st Qu.: 4.000 1st Qu.: 8.00
## Median : 7.000 Median :16.00
## Mean : 6.534 Mean :15.73
## 3rd Qu.:10.000 3rd Qu.:23.00
## Max. :12.000 Max. :31.00
##
Observations:
All of the meteorological variables have some missing data, but the amount of missing snow data is larger than the others. In some cases data might be missing as a sloppy alternative to zero. We need to see if this replacement would solve our problems. First, let’s look at the SNOW variable.
bad_snow = OAP_Raw3 %>%
select(SNOW,DATE) %>%
filter(is.na(SNOW))
summary(bad_snow)
## SNOW DATE
## Min. : NA Min. :1996-02-02
## 1st Qu.: NA 1st Qu.:2000-07-06
## Median : NA Median :2004-08-21
## Mean :NaN Mean :2004-07-09
## 3rd Qu.: NA 3rd Qu.:2008-05-06
## Max. : NA Max. :2020-02-03
## NA's :5416
It looks like the bad snow data began in 1996. There is so much bad data that we should see if all of the recent snow data is bad. Go back to the complete data and check for positive snow values after 1996.
recent_snow = OAP_Raw3 %>% filter(yr > 1996 & SNOW > 0)
summary(recent_snow)
## STATION DATE NAME PRCP
## Length:8 Min. :2012-01-15 Length:8 Min. :0.09843
## Class :character 1st Qu.:2012-01-17 Class :character 1st Qu.:0.29331
## Mode :character Median :2012-02-15 Mode :character Median :0.36024
## Mean :2012-09-10 Mean :0.51624
## 3rd Qu.:2013-03-31 3rd Qu.:0.74606
## Max. :2014-02-09 Max. :1.09843
## SNOW TMAX TMIN yr
## Min. :0.0315 Min. :30.92 Min. :17.96 Min. :2012
## 1st Qu.:0.0935 1st Qu.:33.48 1st Qu.:24.44 1st Qu.:2012
## Median :0.1594 Median :35.51 Median :30.02 Median :2012
## Mean :0.2682 Mean :36.48 Mean :27.18 Mean :2012
## 3rd Qu.:0.2303 3rd Qu.:38.44 3rd Qu.:30.65 3rd Qu.:2012
## Max. :1.0984 Max. :46.04 Max. :32.00 Max. :2014
## mo dy
## Min. : 1.000 Min. : 8.00
## 1st Qu.: 1.000 1st Qu.:12.00
## Median : 1.500 Median :15.50
## Mean : 2.875 Mean :14.38
## 3rd Qu.: 2.250 3rd Qu.:17.25
## Max. :12.000 Max. :19.00
This looks bad. We know that there has been snow in Olympia after 2014. The variable SNOW needs to be dropped.
What about the missing TMAX, TMIN, and PRCP? Look at when they happened.
bad_data = OAP_Raw3 %>%
filter(is.na(TMAX) | is.na(TMIN) | is.na(PRCP))
summary(bad_data)
## STATION DATE NAME PRCP
## Length:15 Min. :1996-01-24 Length:15 Min. :0.00000
## Class :character 1st Qu.:1997-02-15 Class :character 1st Qu.:0.00000
## Mode :character Median :1997-04-13 Mode :character Median :0.00000
## Mean :1998-08-26 Mean :0.07802
## 3rd Qu.:1997-05-08 3rd Qu.:0.05906
## Max. :2020-02-03 Max. :0.38976
## NA's :4
## SNOW TMAX TMIN yr mo
## Min. :0 Min. :39.02 Min. :28.04 Min. :1996 Min. : 1.000
## 1st Qu.:0 1st Qu.:50.00 1st Qu.:28.04 1st Qu.:1996 1st Qu.: 4.000
## Median :0 Median :60.98 Median :28.04 Median :1997 Median : 5.000
## Mean :0 Mean :55.64 Mean :29.72 Mean :1998 Mean : 5.267
## 3rd Qu.:0 3rd Qu.:63.95 3rd Qu.:30.56 3rd Qu.:1997 3rd Qu.: 5.000
## Max. :0 Max. :66.92 Max. :33.08 Max. :2020 Max. :12.000
## NA's :13 NA's :12 NA's :12
## dy
## Min. : 3.0
## 1st Qu.: 7.0
## Median :12.0
## Mean :12.4
## 3rd Qu.:14.0
## Max. :27.0
##
# View(bad_data) Done in console
Since the amount of missing data is fairly small, we can safely drop it. If we were doing daily time-series analysis, we could do something more complicated and replace the missing data using the remainder of the data.
Now we can produce the final data file.
OAP = OAP_Raw3 %>%
select(-SNOW) %>%
na.omit()
summary(OAP)
## STATION DATE NAME PRCP
## Length:28741 Min. :1941-05-13 Length:28741 Min. :0.0000
## Class :character 1st Qu.:1961-01-13 Class :character 1st Qu.:0.0000
## Mode :character Median :1980-09-15 Mode :character Median :0.0000
## Mean :1980-09-19 Mean :0.1368
## 3rd Qu.:2000-06-01 3rd Qu.:0.1417
## Max. :2020-02-02 Max. :4.8189
## TMAX TMIN yr mo
## Min. : 17.96 Min. :-7.96 Min. :1941 Min. : 1.000
## 1st Qu.: 50.00 1st Qu.:33.08 1st Qu.:1961 1st Qu.: 4.000
## Median : 59.00 Median :39.92 Median :1980 Median : 7.000
## Mean : 60.52 Mean :39.81 Mean :1980 Mean : 6.535
## 3rd Qu.: 71.06 3rd Qu.:46.94 3rd Qu.:2000 3rd Qu.:10.000
## Max. :104.00 Max. :69.08 Max. :2020 Max. :12.000
## dy
## Min. : 1.00
## 1st Qu.: 8.00
## Median :16.00
## Mean :15.73
## 3rd Qu.:23.00
## Max. :31.00