Student Details

Data

# Load the crash records and preview the first rows.
# stringsAsFactors is set explicitly because later steps inspect factor
# levels; since R 4.0 read.csv() would otherwise return character columns.
# NOTE(review): absolute, machine-specific path - adjust before re-running.
Crashes1 <- read.csv(
  "D:/Data Visualisation/Assignment2/Crashes1.csv",
  stringsAsFactors = TRUE
)
head(Crashes1)
##   ACCIDENT_DATE ALCOHOLTIME DAY_OF_WEEK       LIGHT_CONDITION
## 1    26/05/2012          No    Saturday                   Day
## 2     6/06/2012          No   Wednesday                   Day
## 3    24/05/2012          No    Thursday                   Day
## 4     6/06/2012          No   Wednesday             Dusk/Dawn
## 5    22/05/2012          No     Tuesday                   Day
## 6     6/06/2012         Yes   Wednesday Dark Street lights on
##                  SEVERITY SPEED_ZONE LONGITUDE  LATITUDE
## 1   Other injury accident   60 km/hr  145.0607 -37.81037
## 2   Other injury accident   60 km/hr  144.9912 -37.88316
## 3   Other injury accident   40 km/hr  145.0095 -37.82695
## 4 Serious injury accident   50 km/hr  145.1346 -37.84155
## 5 Serious injury accident   80 km/hr  145.2946 -37.88860
## 6 Serious injury accident   70 km/hr  146.2771 -38.18324
##                      REGION_NAME INJ_OR_FATAL FATALITY ALCOHOL_RELATED
## 1 METROPOLITAN SOUTH EAST REGION            2        0              No
## 2 METROPOLITAN NORTH WEST REGION            1        0              No
## 3 METROPOLITAN NORTH WEST REGION            1        0              No
## 4 METROPOLITAN SOUTH EAST REGION            1        0              No
## 5 METROPOLITAN SOUTH EAST REGION            1        0              No
## 6                 EASTERN REGION            1        0              No
##   STAT_DIV_NAME
## 1         Metro
## 2         Metro
## 3         Metro
## 4         Metro
## 5         Metro
## 6       Country
# Drop rows whose region is the literal string "NULL", then drop
# non-injury accidents so only injury/fatal crashes remain.
# NOTE(review): rows with a blank ("") REGION_NAME are not removed here.
Crashes1 <- Crashes1 %>%
  filter(REGION_NAME != "NULL") %>%
  filter(SEVERITY != "Non injury accident")


# Order the region factor by crash count (largest first) so the bars in the
# region plot are drawn in ranked order.
counts <- Crashes1 %>%
  group_by(REGION_NAME) %>%
  summarise(count = n()) %>%
  as.data.frame()
counts <- counts[order(-counts$count), ]
Crashes1$REGION_NAME <- factor(
  Crashes1$REGION_NAME,
  levels = counts$REGION_NAME
)
levels(Crashes1$REGION_NAME)
## [1] "METROPOLITAN SOUTH EAST REGION" "METROPOLITAN NORTH WEST REGION"
## [3] "SOUTH WESTERN REGION"           "EASTERN REGION"                
## [5] "NORTHERN REGION"                "NORTH EASTERN REGION"          
## [7] "WESTERN REGION"                 ""
# Inspect the raw weekday levels (alphabetical, including a blank level).
levels(Crashes1$DAY_OF_WEEK)
## [1] ""          "Friday"    "Monday"    "Saturday"  "Sunday"    "Thursday" 
## [7] "Tuesday"   "Wednesday"
# Put the weekdays in calendar order, Monday first. Values that are not
# listed in `levels` (here the blank "") become NA.
Crashes1$DAY_OF_WEEK <- factor(
  Crashes1$DAY_OF_WEEK,
  levels = c(
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday"
  )
)
# No further releveling is needed: the previous fct_relevel() call named
# abbreviated levels ("Mon", "Tue", ...) that do not exist in this factor,
# so it changed nothing and only raised "Unknown levels" warnings.
levels(Crashes1$DAY_OF_WEEK)
## [1] "Monday"    "Tuesday"   "Wednesday" "Thursday"  "Friday"    "Saturday" 
## [7] "Sunday"
# ACCIDENT_DATE arrives as a factor of day/month/year strings.
class(Crashes1$ACCIDENT_DATE)
## [1] "factor"
head(Crashes1$ACCIDENT_DATE)
## [1] 26/05/2012 6/06/2012  24/05/2012 6/06/2012  22/05/2012 6/06/2012 
## 2040 Levels: 1/01/2012 1/01/2013 1/01/2014 1/01/2015 1/01/2016 ... 9/12/2016
# Parse the day/month/year text into Date values.
Crashes1$ACCIDENT_DATE <- dmy(Crashes1$ACCIDENT_DATE)
class(Crashes1$ACCIDENT_DATE)
## [1] "Date"
head(year(Crashes1$ACCIDENT_DATE))
## [1] 2012 2012 2012 2012 2012 2012
# Derive YEAR ("2012") and MONTH ("01".."12") as text columns for tabulation.
Crashes1[["YEAR"]] <- format(Crashes1$ACCIDENT_DATE, format = "%Y")
Crashes1[["MONTH"]] <- format(Crashes1$ACCIDENT_DATE, format = "%m")

# Frequency table of crashes by year (rows) and month (columns).
crosstab <- table(YEAR = Crashes1$YEAR, MONTH = Crashes1$MONTH)
print(crosstab)
##       MONTH
## YEAR     01   02   03   04   05   06   07   08   09   10   11   12
##   2012 1055 1220 1222 1195 1257 1073 1096 1159 1030 1158 1130 1094
##   2013 1009 1173 1225 1159 1239 1157 1092 1106 1108 1203 1180 1166
##   2014 1089 1139 1334 1175 1284 1154 1111 1167 1080 1276 1245 1177
##   2015 1168 1149 1348 1238 1294 1070 1201 1064 1053 1288 1190 1347
##   2016 1124 1288 1290 1282 1305 1145 1180 1177 1011 1142 1144 1100
##   2017  906 1000 1124  919  900  784  479    2    0    0    0    0
# Reshape the frequency table to long form: one row per YEAR/MONTH pair.
crosstab <- as.data.frame(crosstab)
str(crosstab) # structure of the long-form data frame
## 'data.frame':    72 obs. of  3 variables:
##  $ YEAR : Factor w/ 6 levels "2012","2013",..: 1 2 3 4 5 6 1 2 3 4 ...
##  $ MONTH: Factor w/ 12 levels "01","02","03",..: 1 1 1 1 1 1 2 2 2 2 ...
##  $ Freq : int  1055 1009 1089 1168 1124 906 1220 1173 1139 1149 ...
# Rename the frequency column; YEAR and MONTH already carry their names.
names(crosstab)[3] <- "COUNT"
head(crosstab)
##   YEAR MONTH COUNT
## 1 2012    01  1055
## 2 2013    01  1009
## 3 2014    01  1089
## 4 2015    01  1168
## 5 2016    01  1124
## 6 2017    01   906
# Frequency table of crashes by year (rows) and day of week (columns).
crosstab3 <- table(
  YEAR = Crashes1$YEAR,
  DAY_OF_WEEK = Crashes1$DAY_OF_WEEK
)
print(crosstab3)
##       DAY_OF_WEEK
## YEAR   Monday Tuesday Wednesday Thursday Friday Saturday Sunday
##   2012   1911    1889      2017     2168   2145     1833   1650
##   2013   1900    2050      2057     2088   2160     1872   1631
##   2014   1899    1952      2167     2143   2181     1876   1861
##   2015   1941    2069      2065     2250   2224     1655   1879
##   2016   1975    2025      2145     2047   2219     1524   1815
##   2017    914     943       838      935    916      550    804
# Reshape the frequency table to long form: one row per YEAR/DAY_OF_WEEK pair.
crosstab3 <- as.data.frame(crosstab3)
str(crosstab3) # structure of the long-form data frame
## 'data.frame':    42 obs. of  3 variables:
##  $ YEAR       : Factor w/ 6 levels "2012","2013",..: 1 2 3 4 5 6 1 2 3 4 ...
##  $ DAY_OF_WEEK: Factor w/ 7 levels "Monday","Tuesday",..: 1 1 1 1 1 1 2 2 2 2 ...
##  $ Freq       : int  1911 1900 1899 1941 1975 914 1889 2050 1952 2069 ...
# Rename the frequency column; the first two columns already carry their names.
names(crosstab3)[3] <- "COUNT"
head(crosstab3)
##   YEAR DAY_OF_WEEK COUNT
## 1 2012      Monday  1911
## 2 2013      Monday  1900
## 3 2014      Monday  1899
## 4 2015      Monday  1941
## 5 2016      Monday  1975
## 6 2017      Monday   914
# Count crashes per light condition and express each count as a share of
# all rows in Crashes1.
Crashes_sum <- Crashes1 %>%
  group_by(LIGHT_CONDITION) %>%
  summarise(count = n()) %>%
  mutate(Proportion = count / nrow(Crashes1))
# Order the factor levels by descending count so the bars plot in ranked order.
Crashes_sum$LIGHT_CONDITION <- factor(
  Crashes_sum$LIGHT_CONDITION,
  levels = Crashes_sum$LIGHT_CONDITION[order(-Crashes_sum$count)]
)

Visualisation

# Visualise Your Data

# Horizontal stacked bars: each region's share of all crashes, split by
# severity.
p4 <- ggplot(Crashes1, aes(x = REGION_NAME, fill = SEVERITY)) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  coord_flip() +
  scale_fill_brewer(palette = "Accent") +
  labs(
    title = "Metropolitan region have more crashes with injuries(Jan2012-Jul2017)",
    y = "Proportion of crashes",
    x = "Region names",
    fill = "Severity"
  )

# Monthly crash counts, one line (with point markers) per year.
p5 <- ggplot(
  crosstab,
  aes(x = MONTH, y = COUNT, group = YEAR, colour = YEAR)
) +
  geom_line() +
  geom_point() +
  theme_classic() +
  labs(
    x = "Months",
    colour = "Year",
    title = "No of crashes in Melbourne(Jan 2012 to Jul 2017)"
  )
  
# Crash counts by day of week, one line (with point markers) per year.
# The title spelling is corrected and the legend title matches the monthly
# plot ("Year").
p6 <- ggplot(
  crosstab3,
  aes(x = DAY_OF_WEEK, y = COUNT, group = YEAR, colour = YEAR)
) +
  geom_line() +
  geom_point() +
  labs(
    x = "Day of Week",
    colour = "Year",
    title = "Distribution of crashes for day of week(Jan 2012 to Jul2017)"
  ) +
  theme_classic()

# Horizontal bars: share of crashes under each light condition.
# The fill colour only duplicates the axis labels, so the legend is hidden;
# "none" (lower case) is the documented value for legend.position.
p7 <- ggplot(
  Crashes_sum,
  aes(x = LIGHT_CONDITION, y = Proportion, fill = LIGHT_CONDITION)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme(legend.position = "none") +
  labs(
    x = "Light conditions",
    title = "Distribution of crashes as per light conditions during crash(Jan 2012 to Jul2017)",
    caption = "Source: https://www.data.vic.gov.au/data/dataset/crashes-last-five-years"
  )


# Lay the four plots out on one page under a shared heading, filled
# row-wise: monthly trend, weekday trend, regions, light conditions.
gridExtra::grid.arrange(
  grobs = list(p5, p6, p4, p7),
  top = "Analysis of crashes in Melbourne(Jan 2012- Jul 2017)"
)

The grid above contains four plots. The first is a line graph showing the monthly trend in crashes for each year (January 2012 to July 2017) in Melbourne; the months follow a broadly similar pattern across the years, but in 2017, up to July, there appears to be a drop in the number of crashes compared with other years, probably because the measures taken by the Victorian government to reduce the number of crashes are working. The second plot shows the distribution of crashes across the days of the week, and we can see that Thursday and Friday have the most crashes. The third plot depicts the region-wise distribution of crashes and the severity (serious injury, other injury and fatal) of the crashes in each region. We can clearly see that the number of crashes is highest in the metropolitan regions, because the population is dense there. Other injuries are more common than serious injuries, whereas fatal accidents are relatively rare in all regions. The last plot presents the light conditions at the time of the crash; it shows that most crashes occur in daylight.