R Notebook

Traffic Fatalities Australia 2016

Data

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

dat<-read.csv("e:/shiny/trafficmortality/tf201610.csv",sep=",")

Some Initial Observations

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.3.2

ggplot(dat,aes(as.numeric(Age)))+geom_histogram(aes(y=..density..),col="black",fill="red")+facet_wrap(~Road_User)+xlab("Age")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Age Distribution of Fatalities By Road User

ggplot(dat,aes(as.numeric(Age)))+geom_histogram(aes(y=..density..),col="black",fill="red")+facet_wrap(~Road_User+Gender)+xlab("Age")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

library(stringr)
dat<-dat%>%
  mutate(date=as.Date(paste(Year,"-",str_pad(Month,2,pad="0"),"-",str_pad(Day,2,pad="0")),
                            format="%Y - %m -%d"))

dat_count<-dat%>%
  group_by(Year,Month)%>%
  count(date)
dat_ym<-dat_count%>%
  group_by(Year,Month)%>%
  summarise_each(funs(sum),n)
dticks<-seq(as.Date("1989/1/1"),by="month",length.out = 334)
plot(dticks,dat_ym$n,type="line",main="Monthly Traffic Deaths by Year",xlab="Year",ylab="Number",col="red")

## Warning in plot.xy(xy, type, ...): plot type 'line' will be truncated to
## first character

library(xts)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

library(dygraphs)
time_s<-xts(dat_ym$n,dticks)
dygraph(time_s)%>%dyRangeSelector()%>% 
      dyAxis("y",valueRange=c(0,300),label="Number")%>%
      dyAxis("x",valueRange=c(as.Date("2011-01-01"),Sys.Date()))%>%
  dyOptions(colors="red")

library(stringr)
dat_state<-dat%>%
  group_by(Year,Month,State)%>%
  count(date)
dat_yms<-dat_state%>%
  group_by(Year,Month,State)%>%
  summarise_each(funs(sum),n)
dat_yms<-dat_yms%>%
  mutate(date=as.Date(paste(Year,"-",str_pad(Month,2,pad="0"),"-","01"),format="%Y -%m -%d"))
dpl<-function(state){
ggplot(data=filter(dat_yms,State==state))+geom_line(aes(date,n),color="red")+
    geom_smooth(aes(date,n))+
    ylim(0,125)+
    ggtitle(state)}
lapply(levels(dat_yms$State),dpl)

## [[1]]

## `geom_smooth()` using method = 'loess'

## 
## [[2]]

## `geom_smooth()` using method = 'loess'

## 
## [[3]]

## `geom_smooth()` using method = 'loess'

## 
## [[4]]

## `geom_smooth()` using method = 'loess'

## 
## [[5]]

## `geom_smooth()` using method = 'loess'

## 
## [[6]]

## `geom_smooth()` using method = 'loess'

## 
## [[7]]

## `geom_smooth()` using method = 'loess'

## 
## [[8]]

## `geom_smooth()` using method = 'loess'

Time Series Decomposition

tmts<-ts(dat_ym$n,frequency = 12,start=c(1989,1))
plot(stl(tmts,"per"))

State Populations With Time

This data was derived from WolframAlpha queries and interpolation.

pop<-read.csv("e:/pandoc/auspop.csv",sep=",",header=FALSE)
names(pop)<-c("State","Year","Population")

ggplot(data=pop)+geom_line(aes(Year,Population),color="red")+facet_wrap(~State)

Mortality Rates

The preceding data was used for calculation of per capital traffic fatality rate.

dj<-inner_join(dat_yms,pop,by = c("Year", "State"))
dpf<-function(state){
ggplot(data=filter(dj,State==state))+geom_line(aes(date,100000*n/Population),color="red")+
    geom_smooth(aes(date,100000*n/Population))+
    ylab("Mortality rate (per 100000)")+
    ggtitle(state)}
lapply(levels(dat_yms$State),dpf)

## [[1]]

## `geom_smooth()` using method = 'loess'

## 
## [[2]]

## `geom_smooth()` using method = 'loess'

## 
## [[3]]

## `geom_smooth()` using method = 'loess'

## 
## [[4]]

## `geom_smooth()` using method = 'loess'

## 
## [[5]]

## `geom_smooth()` using method = 'loess'

## 
## [[6]]

## `geom_smooth()` using method = 'loess'

## 
## [[7]]

## `geom_smooth()` using method = 'loess'

## 
## [[8]]

## `geom_smooth()` using method = 'loess'

library(rethinking)
s2n<-function(state){switch(as.character(state),"QLD"=1,"NSW"=2,"ACT"=3,"VIC"=4,"TAS"=5,"SA"=6,"WA"=7,"NT"=8)}
dj<-mutate(dj,time=Year+Month/12,log_pop=log(Population))
dj$state<-lapply(dj$State,s2n)
dj$state<-as.integer(dj$state)
djdf<-data.frame(select(dj,time,log_pop,state,n))
m<-map2stan(
  alist(
n~dpois(lambda),
log(lambda)<-location[state]+log_pop+b*time+slope[state]*time,
location[state]~dnorm(0,1),
slope[state]~dnorm(0,1),
b~dnorm(0,1)
    ),
  data=djdf,
  iter=1000
)

## 
## SAMPLING FOR MODEL 'n ~ dpois(lambda)' NOW (CHAIN 1).
## 
## Chain 1, Iteration:   1 / 1000 [  0%]  (Warmup)
## Chain 1, Iteration: 100 / 1000 [ 10%]  (Warmup)
## Chain 1, Iteration: 200 / 1000 [ 20%]  (Warmup)
## Chain 1, Iteration: 300 / 1000 [ 30%]  (Warmup)
## Chain 1, Iteration: 400 / 1000 [ 40%]  (Warmup)
## Chain 1, Iteration: 500 / 1000 [ 50%]  (Warmup)
## Chain 1, Iteration: 501 / 1000 [ 50%]  (Sampling)
## Chain 1, Iteration: 600 / 1000 [ 60%]  (Sampling)
## Chain 1, Iteration: 700 / 1000 [ 70%]  (Sampling)
## Chain 1, Iteration: 800 / 1000 [ 80%]  (Sampling)
## Chain 1, Iteration: 900 / 1000 [ 90%]  (Sampling)
## Chain 1, Iteration: 1000 / 1000 [100%]  (Sampling)
##  Elapsed Time: 1.326 seconds (Warm-up)
##                57.161 seconds (Sampling)
##                58.487 seconds (Total)
## 
## 
## SAMPLING FOR MODEL 'n ~ dpois(lambda)' NOW (CHAIN 1).
## WARNING: No variance estimation is
##          performed for num_warmup < 20
## 
## 
## Chain 1, Iteration: 1 / 1 [100%]  (Sampling)
##  Elapsed Time: 0 seconds (Warm-up)
##                0.001 seconds (Sampling)
##                0.001 seconds (Total)
## 
## [ 50 / 500 ]
[ 100 / 500 ]
[ 150 / 500 ]
[ 200 / 500 ]
[ 250 / 500 ]
[ 300 / 500 ]
[ 350 / 500 ]
[ 400 / 500 ]
[ 450 / 500 ]
[ 500 / 500 ]

precis(m,depth=2)

## Warning in precis(m, depth = 2): There were 1 divergent iterations during sampling.
## Check the chains (trace plots, n_eff, Rhat) carefully to ensure they are valid.

##              Mean StdDev lower 0.89 upper 0.89 n_eff Rhat
## location[1]  0.00   0.00       0.00       0.00     4 1.62
## location[2]  0.00   0.00       0.00       0.00    11 1.04
## location[3]  0.00   0.00       0.00       0.00     8 1.12
## location[4]  0.00   0.00       0.00       0.00     4 1.47
## location[5]  0.00   0.00       0.00       0.00    27 1.01
## location[6]  0.00   0.00       0.00       0.00     9 1.12
## location[7]  0.00   0.00       0.00       0.00     5 1.35
## location[8]  0.00   0.00       0.00       0.00     4 1.51
## slope[1]     0.11   0.03       0.08       0.15     2 4.53
## slope[2]     0.11   0.03       0.08       0.15     2 4.53
## slope[3]     0.06   0.02       0.03       0.09     2 4.91
## slope[4]     0.11   0.03       0.08       0.15     2 4.53
## slope[5]     0.11   0.02       0.08       0.14     3 4.42
## slope[6]     0.11   0.03       0.08       0.15     2 4.52
## slope[7]     0.11   0.03       0.08       0.15     2 4.53
## slope[8]     0.11   0.02       0.08       0.14     2 4.97
## b           -0.12   0.03      -0.15      -0.09     2 4.53

plot(precis(m,depth=2))

## Warning in precis(m, depth = 2): There were 1 divergent iterations during sampling.
## Check the chains (trace plots, n_eff, Rhat) carefully to ensure they are valid.

m2<-map2stan(
  alist(
n~dpois(lambda),
log(lambda)<-location[state]+log_pop+b*time,

location[state]~dnorm(-1,1),
b~dnorm(0,1)
    ),
  data=djdf,
  iter=10000
)

## 
## SAMPLING FOR MODEL 'n ~ dpois(lambda)' NOW (CHAIN 1).
## 
## Chain 1, Iteration:    1 / 10000 [  0%]  (Warmup)
## Chain 1, Iteration: 1000 / 10000 [ 10%]  (Warmup)
## Chain 1, Iteration: 2000 / 10000 [ 20%]  (Warmup)
## Chain 1, Iteration: 3000 / 10000 [ 30%]  (Warmup)
## Chain 1, Iteration: 4000 / 10000 [ 40%]  (Warmup)
## Chain 1, Iteration: 5000 / 10000 [ 50%]  (Warmup)
## Chain 1, Iteration: 5001 / 10000 [ 50%]  (Sampling)
## Chain 1, Iteration: 6000 / 10000 [ 60%]  (Sampling)
## Chain 1, Iteration: 7000 / 10000 [ 70%]  (Sampling)
## Chain 1, Iteration: 8000 / 10000 [ 80%]  (Sampling)
## Chain 1, Iteration: 9000 / 10000 [ 90%]  (Sampling)
## Chain 1, Iteration: 10000 / 10000 [100%]  (Sampling)
##  Elapsed Time: 1115.18 seconds (Warm-up)
##                173.772 seconds (Sampling)
##                1288.95 seconds (Total)
## 
## 
## SAMPLING FOR MODEL 'n ~ dpois(lambda)' NOW (CHAIN 1).
## WARNING: No variance estimation is
##          performed for num_warmup < 20
## 
## 
## Chain 1, Iteration: 1 / 1 [100%]  (Sampling)
##  Elapsed Time: 0 seconds (Warm-up)
##                0.001 seconds (Sampling)
##                0.001 seconds (Total)

## Computing WAIC

## Constructing posterior predictions

## [ 500 / 5000 ]
[ 1000 / 5000 ]
[ 1500 / 5000 ]
[ 2000 / 5000 ]
[ 2500 / 5000 ]
[ 3000 / 5000 ]
[ 3500 / 5000 ]
[ 4000 / 5000 ]
[ 4500 / 5000 ]
[ 5000 / 5000 ]

precis(m2,depth=2)

##              Mean StdDev lower 0.89 upper 0.89 n_eff Rhat
## location[1]  5.29   0.32       4.76       5.80   301    1
## location[2]  5.16   0.32       4.62       5.66   302    1
## location[3]  4.91   0.32       4.35       5.40   306    1
## location[4]  5.12   0.32       4.56       5.61   302    1
## location[5]  5.46   0.32       4.95       6.00   302    1
## location[6]  5.33   0.32       4.78       5.83   301    1
## location[7]  5.40   0.32       4.87       5.91   302    1
## location[8]  6.30   0.32       5.80       6.85   303    1
## b           -0.01   0.00      -0.01      -0.01   301    1

plot(precis(m2,depth=2))

tse<-function(state,time){
  s<-extract.samples(m2)
  mean(exp(s$location[,s2n(state)]+s$b*time))
}
tseq<-function(state,time,q){
   s<-extract.samples(m2)
  qn<-quantile(exp(s$location[,s2n(state)]+s$b*time),q)
  qn[1]
  
}
tseq025<-function(state,time){aeq(state,time,0.025)}
tseq975<-function(state,time){aeq(state,time,0.975)}

t<-seq(1989,2016.8,0.1)
f1<-function(state){
tm<-seq(1989,2016.8,0.1)
er<-sapply(tm,tse,state=state)
}
plot(t,f1("QLD"),type="l",xlab="Year",ylab="Monthly Traffic Fatality Rate per Capita",xlim=c(1989,2025),ylim=c(0,3e-05))
lines(t,f1("NSW"),col="red")
lines(t,f1("VIC"),col="blue")
lines(t,f1("ACT"),col="red",lty=2)
lines(t,f1("TAS"),col="blue",lty=2)
lines(t,f1("SA"),col="green")
lines(t,f1("WA"),col="green",lty=2)
lines(t,f1("NT"),col="black",lty=2)
legend(2017,2e-05,c("QLD","NSW","VIC","ACT","TAS","SA","WA","NT"),
       lty=c(1,1,1,2,2,1,2,2),col=c("black","red",                        "blue","red","blue","green","green","black"))

sim<-extract.samples(m2,n=1000)
orntqld<-exp(sim$location[,8]+sim$b*1989)/exp(sim$location[,1]+sim$b*1989)
hist(orntqld)