rainfall.RData…load('rainfall.RData')
head(stations)
# A tibble: 6 × 9
Station Elevation Easting Northing Lat Long County
<chr> <int> <dbl> <dbl> <dbl> <dbl> <chr>
1 Athboy 87 270400 261700 53.60 -6.93 Meath
2 Foulksmills 71 284100 118400 52.30 -6.77 Wexford
3 Mullingar 112 241780 247765 53.47 -7.37 Westmeath
4 Portlaw 8 246600 115200 52.28 -7.31 Waterford
5 Rathdrum 131 319700 186000 52.91 -6.22 Wicklow
6 Strokestown 49 194500 279100 53.75 -8.10 Roscommon
# ... with 2 more variables: Abbreviation <chr>, Source <chr>
head(rain)
# A tibble: 6 × 4
Year Month Rainfall Station
<dbl> <ord> <dbl> <chr>
1 1850 Jan 169.0 Ardara
2 1851 Jan 236.4 Ardara
3 1852 Jan 249.7 Ardara
4 1853 Jan 209.1 Ardara
5 1854 Jan 188.5 Ardara
6 1855 Jan 32.3 Ardara
dplyrlibrary(dplyr); library(tidyverse)
rain %>% group_by(Station) %>%
summarise(mrain=mean(Rainfall)) -> rain_summary
head(rain_summary)
# A tibble: 6 × 2
Station mrain
<chr> <dbl>
1 Ardara 140.36753
2 Armagh 68.32096
3 Athboy 74.74356
4 Belfast 87.10995
5 Birr 70.83498
6 Cappoquinn 121.22455
%>% command acts as a ‘pipeline’.x %>% f(y) is the same as f(x,y).x %>% f1(y) %>% f2(z) is the same as f2(f1(x,y),z) but easier to read.rain %>% group_by(Month) %>%
summarise(mrain=mean(Rainfall)) -> rain_months
head(rain_months)
# A tibble: 6 × 2
Month mrain
<ord> <dbl>
1 Jan 112.64355
2 Feb 83.24975
3 Mar 79.53280
4 Apr 68.74165
5 May 71.31769
6 Jun 72.74104
summarise applies a summary function to each group and creates a new data frame with one entry for each group. Any R summary function can be used - eg median, sd, max or a user defined function.
library(ggplot2)
ggplot(aes(x=Month,y=mrain),data=rain_months) + geom_bar(stat='identity') + labs(y='Mean Rainfall')
rain %>% group_by(Year) %>%
summarise(total_rain=sum(Rainfall)) -> rain_years
ggplot(aes(x=Year,y=total_rain),data=rain_years) + geom_line(stat='identity') + labs(y='Mean Annual Rainfall')
filterrain %>% group_by(Year) %>%
filter(Station=='Strokestown') %>%
summarise(total_rain=sum(Rainfall)) -> rain_years_str
ggplot(aes(x=Year,y=total_rain),data=rain_years_str) + geom_line(stat='identity') +
labs(y='Mean Annual Rainfall') -> strokes
strokes
strokes + geom_smooth()
rain %>% group_by(Year, Station) %>%
summarise(total_rain=sum(Rainfall)) -> rain_years_st
ggplot(aes(x=Year,y=total_rain),data=rain_years_st) +
geom_smooth() + labs(y='Mean Annual Rainfall') +
facet_wrap(~Station,nrow=4) + theme_light()
rain %>% group_by(Month,Station) %>%
summarise(mean_rain=mean(Rainfall)) -> rain_season_station
head(rain_season_station)
Source: local data frame [6 x 3]
Groups: Month [1]
Month Station mean_rain
<ord> <chr> <dbl>
1 Jan Ardara 174.82606
2 Jan Armagh 74.57242
3 Jan Athboy 84.94759
4 Jan Belfast 101.20718
5 Jan Birr 79.92074
6 Jan Cappoquinn 153.97159
Note that with is implicit in the dplyr commands such as filter and summarise
reshape2 packagelibrary(reshape2)
rain_season_station %>% acast(Station~Month) %>% head
Jan Feb Mar Apr May Jun
Ardara 174.82606 126.82303 123.02000 98.79333 96.90727 105.24061
Armagh 74.57242 55.97182 56.48879 53.67030 59.23182 62.72939
Athboy 84.94759 62.62133 62.44944 58.97874 62.16260 68.11460
Belfast 101.20718 74.50206 73.10221 65.90492 69.23426 74.48525
Birr 79.92074 57.88501 58.42056 54.07187 60.25831 61.96445
Cappoquinn 153.97159 117.77099 110.02890 94.00365 95.81437 94.86357
Jul Aug Sep Oct Nov Dec
Ardara 123.70485 145.24788 152.80727 174.44788 176.45030 186.14182
Armagh 72.50636 81.92182 69.02576 80.94242 73.73121 79.05939
Athboy 76.28662 88.84710 76.35077 88.14627 80.95767 87.05997
Belfast 87.70003 102.44499 87.11622 106.35394 100.31760 102.95073
Birr 75.10084 86.75669 72.21722 83.80810 77.87266 81.74334
Cappoquinn 104.09372 125.42245 116.04466 146.70658 141.10012 154.87402
library(reshape2)
rain_season_station %>% acast(Station~Month) %>% heatmap(Colv=NA)
%>% is useful hereThis code takes the rain data frame and runs through the steps to aggregated, reshapeand compute the heat map:
rain %>% group_by(Month,Station) %>% summarise(mean_rain=mean(Rainfall)) %>%
acast(Station~Month) %>%
heatmap(Colv=NA)
So does this code, in ‘traditional’ R format:
heatmap(acast(summarise(group_by(rain,Month,Station),
mean_rain=mean(Rainfall)),Station~Month),Colv=NA)
Which do you think is clearer?
rain %>% group_by(Month,Station) %>%
summarise(mean_rain=mean(Rainfall),sd_rain=sd(Rainfall)) -> rain_mnsd
head(rain_mnsd)
Source: local data frame [6 x 4]
Groups: Month [1]
Month Station mean_rain sd_rain
<ord> <chr> <dbl> <dbl>
1 Jan Ardara 174.82606 64.59834
2 Jan Armagh 74.57242 28.91264
3 Jan Athboy 84.94759 32.84887
4 Jan Belfast 101.20718 40.60078
5 Jan Birr 79.92074 33.74263
6 Jan Cappoquinn 153.97159 66.33995
mutate - Create new variables in a data framerain %>% group_by(Month) %>%
summarise(mean_rain=mean(Rainfall),sd_rain=sd(Rainfall)) %>%
mutate(cv_rain=100 * sd_rain/mean_rain) -> rain_mnsdcv
head(rain_mnsdcv)
# A tibble: 6 × 4
Month mean_rain sd_rain cv_rain
<ord> <dbl> <dbl> <dbl>
1 Jan 112.64355 57.56599 51.10456
2 Feb 83.24975 51.45153 61.80382
3 Mar 79.53280 44.33714 55.74699
4 Apr 68.74165 36.39073 52.93841
5 May 71.31769 37.20539 52.16853
6 Jun 72.74104 40.92818 56.26560
ggplot(aes(x=Month,y=cv_rain), data=rain_mnsdcv) + geom_bar(stat='identity') + labs(y='Coefficient of Variation')
ggplot(aes(x=Month,y=sd_rain), data=rain_mnsdcv) + geom_bar(stat='identity',fill='dodgerblue') + labs(y='Standard Deviation')
Source: http://aplus.com/a/instachaz-painfully-accurate-graphs-about-life
rain %>% group_by(Year,Month) %>% summarise(rf=sum(Rainfall)) %>% ungroup %>%
mutate(moyr = 1850 + row_number()/12) %>% filter(Year <= 1870) -> monthly_total
ggplot(aes(x=moyr,y=rf),data=monthly_total) + geom_point() +
geom_line(col='indianred') + labs(x='Year',y='Rainfall') +
geom_smooth()
ts - A time series object that is useful for some functions…
monthly_total$rf %>% ts(freq=12,start=1850) -> rain_ts # Time serioes object
rain_ts %>% window(c(1850,1),c(1853,12)) # window from jan 1850 - dec 1853
Jan Feb Mar Apr May Jun Jul Aug Sep Oct
1850 2836.3 2158.9 964.1 3457.2 1492.1 1362.4 2584.8 1906.4 1763.1 1567.8
1851 4875.5 1379.9 1997.9 1368.1 1124.2 2537.1 2258.3 2416.2 1234.5 2834.2
1852 4036.1 2311.2 1116.0 1110.0 1754.6 4484.9 1716.0 2752.0 1433.1 2217.7
1853 3618.6 1073.9 2088.0 1881.9 829.1 1936.7 2512.3 2072.0 1504.1 4627.6
Nov Dec
1850 2828.5 2600.1
1851 1134.3 1719.2
1852 5581.2 5205.9
1853 2651.1 1209.3
Note c(1850,1) is January 1850 etc. when using functions like window
monthplot functionrain_ts %>% window(c(1850,1),c(1869,12)) %>%
monthplot(col='dodgerblue',col.base='indianred',lwd.base=3)
rain_ts %>%
monthplot(col='dodgerblue',col.base='indianred',lwd.base=3)
View data as trend + seasonal + random components:
rain_ts %>%
stl(s.window='periodic') -> rain_decomp
rain_decomp
seasonal trend remainder
Jan 1850 771.4454 2232.491 -167.6364
Feb 1850 -472.9187 2223.735 408.0838
Mar 1850 -224.4634 2214.979 -1026.4155
Apr 1850 -413.7749 2210.759 1660.2163
May 1850 -401.4005 2206.538 -313.0379
Jun 1850 -328.9042 2207.861 -516.5569
Jul 1850 -314.2940 2209.184 689.9102
R. B. Cleveland, W. S. Cleveland, J.E. McRae, and I. Terpenning (1990) STL: A Seasonal-Trend Decomposition Procedure Based on Loess. Journal of Official Statistics, v6, pp3–73.
rain_ts %>% window(c(1850,1),c(1859,12)) %>%
stl(s.window='periodic') %>% plot
rain_ts %>% stl(s.window='periodic') %>% plot
rain_ts %>% stl(s.window='periodic',t.window=361) %>% plot
stl objecttime.series componentrain_ts %>% stl(s.window='periodic',t.window=361) -> rain_decomp2
rain_decomp2$time.series[,3] %>% window(c(1850,1),c(1853,12))
Jan Feb Mar Apr May
1850 -48.66935 518.57705 -924.39356 1755.36327 -224.77064
1851 1982.50372 -268.44987 101.37951 -341.76365 -600.69757
1852 1135.07680 654.82320 -788.54742 -607.89058 21.67550
1853 709.54987 -590.50373 175.37368 155.87854 -912.00735
Jun Jul Aug Sep Oct
1850 -426.21127 782.33355 -372.07972 -307.32110 -1039.27185
1851 740.46180 447.80663 129.69335 -843.94803 219.10123
1852 2680.23487 -102.52030 457.46643 -653.37495 -405.42570
1853 123.80005 685.49290 -230.87235 -590.76571 1996.03157
Nov Dec
1850 433.67692 9.66413
1851 -1268.55001 -879.26280
1852 3170.32307 2599.41028
1853 231.72836 -1405.73640
dplyr and pipeline-based commands, and related functionsts objects for time and related functionsmonthplot etc.