library(tidyverse)
## ── Attaching packages ──────────────────
## ✔ ggplot2 2.2.1.9000     ✔ purrr   0.2.4     
## ✔ tibble  1.4.2          ✔ dplyr   0.7.4     
## ✔ tidyr   0.8.0          ✔ stringr 1.3.0     
## ✔ readr   1.1.1          ✔ forcats 0.3.0
## ── Conflicts ── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
olywthr <- read_csv("~/Dropbox/RProjects/Oly Weather/olyw3.csv", col_types = cols(DATE = col_date(format = "%Y%m%d"),SNOW = col_double()))
str(olywthr)
## Classes 'tbl_df', 'tbl' and 'data.frame':    52599 obs. of  6 variables:
##  $ STATION_NAME: chr  "OLYMPIA PRIEST PT PA WA US" "OLYMPIA PRIEST PT PA WA US" "OLYMPIA PRIEST PT PA WA US" "OLYMPIA PRIEST PT PA WA US" ...
##  $ DATE        : Date, format: "1877-07-01" "1877-07-02" ...
##  $ PRCP        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SNOW        : num  -9999 -9999 -9999 -9999 -9999 ...
##  $ TMAX        : int  63 63 67 67 71 74 80 88 81 70 ...
##  $ TMIN        : int  48 53 45 45 43 49 49 50 57 57 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 6
##   .. ..$ STATION_NAME: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ DATE        :List of 1
##   .. .. ..$ format: chr "%Y%m%d"
##   .. .. ..- attr(*, "class")= chr  "collector_date" "collector"
##   .. ..$ PRCP        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ SNOW        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ TMAX        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ TMIN        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
olywthr %>% mutate(TMAX = ifelse(TMAX == -9999,NA,TMAX),
       TMIN = ifelse(TMIN == -9999,NA,TMIN),
SNOW = ifelse(SNOW == -9999,0,SNOW), 
PRCP = ifelse(PRCP == -9999,0,PRCP), 
       yr = year(DATE),
       mo = month(DATE),
       dy = day(DATE)) %>% 
filter(STATION_NAME == "OLYMPIA AIRPORT WA US" | yr < 1948) -> olywthr 
olywthr %>% filter(complete.cases(olywthr)) -> olywthr
save(olywthr,file="olywthr.rdata")
summary(olywthr)
##  STATION_NAME            DATE                 PRCP       
##  Length:49316       Min.   :1877-07-01   Min.   :0.0000  
##  Class :character   1st Qu.:1913-05-10   1st Qu.:0.0000  
##  Mode  :character   Median :1949-12-24   Median :0.0000  
##                     Mean   :1948-11-12   Mean   :0.1409  
##                     3rd Qu.:1983-09-26   3rd Qu.:0.1400  
##                     Max.   :2017-07-11   Max.   :4.8200  
##       SNOW               TMAX             TMIN             yr      
##  Min.   : 0.00000   Min.   : 15.00   Min.   :-8.00   Min.   :1877  
##  1st Qu.: 0.00000   1st Qu.: 50.00   1st Qu.:34.00   1st Qu.:1913  
##  Median : 0.00000   Median : 59.00   Median :41.00   Median :1949  
##  Mean   : 0.02647   Mean   : 60.64   Mean   :40.42   Mean   :1948  
##  3rd Qu.: 0.00000   3rd Qu.: 71.00   3rd Qu.:47.00   3rd Qu.:1983  
##  Max.   :14.20000   Max.   :104.00   Max.   :76.00   Max.   :2017  
##        mo               dy       
##  Min.   : 1.000   Min.   : 1.00  
##  1st Qu.: 4.000   1st Qu.: 8.00  
##  Median : 7.000   Median :16.00  
##  Mean   : 6.516   Mean   :15.74  
##  3rd Qu.:10.000   3rd Qu.:23.00  
##  Max.   :12.000   Max.   :31.00
olywthr %>% filter(mo==12) %>% 
          group_by(yr) %>% 
          summarize(mmax = mean(TMAX,na.rm=TRUE),
                    mmin = mean(TMIN,na.rm=TRUE)) %>%  
  ungroup() -> december

december
## # A tibble: 135 x 3
##       yr  mmax  mmin
##    <dbl> <dbl> <dbl>
##  1 1877.  46.5  37.9
##  2 1878.  44.2  35.8
##  3 1879.  42.8  32.9
##  4 1880.  44.5  34.6
##  5 1881.  46.9  37.1
##  6 1882.  46.6  38.3
##  7 1883.  45.9  35.4
##  8 1884.  39.5  28.1
##  9 1885.  48.7  37.8
## 10 1886.  49.4  40.1
## # ... with 125 more rows
december %>% ggplot(aes(yr,mmax)) +
  geom_point(col="red") + 
  geom_smooth(col="red") +
  geom_point(aes(x=yr,y=mmin),col="blue") +
  geom_smooth(aes(x=yr,y=mmin),col="blue")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

olywthr %>% 
  ggplot(aes(x=TMAX)) +
  geom_histogram() +
  facet_wrap(~mo)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

tapply(olywthr$TMAX,olywthr$mo,summary)
## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   40.00   45.00   44.44   49.00   74.00 
## 
## $`2`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19.00   44.00   49.00   48.36   53.00   73.00 
## 
## $`3`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   31.00   49.00   53.00   53.49   57.00   79.00 
## 
## $`4`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   39.00   54.00   58.00   59.63   64.00   90.00 
## 
## $`5`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   46.00   60.00   65.00   66.54   72.00   96.00 
## 
## $`6`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   50.00   65.00   70.00   71.61   77.00  101.00 
## 
## $`7`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   56.00   71.00   77.00   77.54   83.00  104.00 
## 
## $`8`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   56.00   72.00   77.00   77.34   82.00  104.00 
## 
## $`9`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   50.00   65.00   70.00   70.86   76.00   98.00 
## 
## $`10`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    33.0    56.0    60.0    60.5    65.0    90.0 
## 
## $`11`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   24.00   47.00   51.00   50.93   55.00   74.00 
## 
## $`12`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19.00   42.00   46.00   45.46   50.00   70.00
olywthr %>% 
  group_by(mo,dy) %>%
  summarize(prain = mean(PRCP > 0,na.rm=TRUE)) %>% 
  ungroup() -> rainy
str(rainy)
## Classes 'tbl_df', 'tbl' and 'data.frame':    366 obs. of  3 variables:
##  $ mo   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ dy   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ prain: num  0.647 0.607 0.681 0.664 0.654 ...
summary(rainy)
##        mo               dy            prain       
##  Min.   : 1.000   Min.   : 1.00   Min.   :0.0438  
##  1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.:0.2844  
##  Median : 7.000   Median :16.00   Median :0.4630  
##  Mean   : 6.514   Mean   :15.76   Mean   :0.4344  
##  3rd Qu.: 9.750   3rd Qu.:23.00   3rd Qu.:0.5993  
##  Max.   :12.000   Max.   :31.00   Max.   :0.7353
rainy %>% ggplot(aes(x=prain)) +
  geom_histogram() +
  facet_wrap(~mo)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

rainy %>% ggplot(aes(x=prain)) +
  geom_density() +
  facet_wrap(~mo)

Prediction

Let’s create a weather prediction for 2018 based on average values for each day.

olywthr %>% 
  group_by(mo,dy) %>% 
  summarize(prain = mean(PRCP > 0),
            dmax = mean(TMAX),
            dmin = mean(TMIN)) %>% 
             
    ungroup() %>% mutate(date =make_date(2018,mo,dy))-> cal18
cal18 %>% ggplot(aes(x=date,y=dmax)) +
  geom_point() +
  ggtitle("Average Daily Maximum Temperature") +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 1 rows containing missing values (geom_point).

cal18 %>% 
  filter(mo>8 & mo < 11) %>% 
  ggplot(aes(x=date,y=dmax)) +
  geom_point() +
  ggtitle("Average Daily Maximum Temperature") +
  theme(plot.title = element_text(hjust = 0.5))