ACCIDENTS EDA

This database contains information about road accidents that happened in 2017 across all counties in Kenya;

Task :

Data Processing, ie cleaning and manipulating the data set into a data-frame for use in analysis
Exploratory analysis and visualization of the data.

Setting up environment

Notes:The following packages were used(description shown)

library(tidyverse)#For data cleaning and organizing

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.4      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(ggplot2)#For plotting
library(knitr)#For use with rmarkdown in presentation
library(lubridate)#For date use and conversion

## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(rmarkdown)#For creating a sharable document
library(janitor)#For easing data cleaning

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(skimr)#For easy summary
library(plotly)#For interactive plots

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

IMPORT THE DATA(PREPARE)

Notes:Three data sets were imported for this analysis

#used only two data sets that covered 24 hours(initially)
#added weight data set to get context
Test <- read_csv("C:\\Users\\AAH\\Downloads\\Data_intern_task\\kenya-accidents-database.csv")

## Rows: 740 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): TIME 24 HOURS, BASE/SUB BASE, COUNTY, ROAD, PLACE, MV INVOLVED, BR...
## dbl  (1): Month
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

colnames(Test)

##  [1] "TIME 24 HOURS"          "BASE/SUB BASE"          "COUNTY"                
##  [4] "ROAD"                   "PLACE"                  "MV INVOLVED"           
##  [7] "BRIEF ACCIDENT DETAILS" "NAME OF VICTIM"         "GENDER"                
## [10] "AGE"                    "CAUSE CODE"             "VICTIM"                
## [13] "NO."                    "Date DD/MM/YYYY"        "Hour"                  
## [16] "Month"

colnames(Test) <- make.names(colnames(Test))

PREVIEW THE DATA AND FIND THE STRUCTURE

Notes: Just the first 6 rows & then look at the structure

View(Test)
dim(Test)

## [1] 740  16

summary(Test)

##  TIME.24.HOURS      BASE.SUB.BASE         COUNTY              ROAD          
##  Length:740         Length:740         Length:740         Length:740        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     PLACE           MV.INVOLVED        BRIEF.ACCIDENT.DETAILS
##  Length:740         Length:740         Length:740            
##  Class :character   Class :character   Class :character      
##  Mode  :character   Mode  :character   Mode  :character      
##                                                              
##                                                              
##                                                              
##  NAME.OF.VICTIM        GENDER              AGE             CAUSE.CODE       
##  Length:740         Length:740         Length:740         Length:740        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     VICTIM              NO.            Date.DD.MM.YYYY        Hour          
##  Length:740         Length:740         Length:740         Length:740        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      Month       
##  Min.   : 2.000  
##  1st Qu.: 5.000  
##  Median : 7.000  
##  Mean   : 6.557  
##  3rd Qu.: 8.000  
##  Max.   :11.000

colnames(Test)

##  [1] "TIME.24.HOURS"          "BASE.SUB.BASE"          "COUNTY"                
##  [4] "ROAD"                   "PLACE"                  "MV.INVOLVED"           
##  [7] "BRIEF.ACCIDENT.DETAILS" "NAME.OF.VICTIM"         "GENDER"                
## [10] "AGE"                    "CAUSE.CODE"             "VICTIM"                
## [13] "NO."                    "Date.DD.MM.YYYY"        "Hour"                  
## [16] "Month"

CLEANING(PROCESS)

Generate a variable that indicates that the data is owned by HDX

Notes: Check is done by a simple colnames function

Test$OWNER <- "HDX"
#Check if names have changed
colnames(Test)

##  [1] "TIME.24.HOURS"          "BASE.SUB.BASE"          "COUNTY"                
##  [4] "ROAD"                   "PLACE"                  "MV.INVOLVED"           
##  [7] "BRIEF.ACCIDENT.DETAILS" "NAME.OF.VICTIM"         "GENDER"                
## [10] "AGE"                    "CAUSE.CODE"             "VICTIM"                
## [13] "NO."                    "Date.DD.MM.YYYY"        "Hour"                  
## [16] "Month"                  "OWNER"

Rename Date DD/MM/YYYY to Date, VICTIM to VICTIM_TYPE and BASE/SUB BASE to BASE_SUB BASE

Notes: piping makes it easy to combine functions. You’ll see a lot of piping as we go along

Test2 <- Test %>%
  rename(Date = `Date.DD.MM.YYYY`, VICTIM_TYPE = VICTIM, BASE_SUB_BASE = `BASE.SUB.BASE`)
# View updated dataframe
colnames(Test2)

##  [1] "TIME.24.HOURS"          "BASE_SUB_BASE"          "COUNTY"                
##  [4] "ROAD"                   "PLACE"                  "MV.INVOLVED"           
##  [7] "BRIEF.ACCIDENT.DETAILS" "NAME.OF.VICTIM"         "GENDER"                
## [10] "AGE"                    "CAUSE.CODE"             "VICTIM_TYPE"           
## [13] "NO."                    "Date"                   "Hour"                  
## [16] "Month"                  "OWNER"

Test2

## # A tibble: 740 × 17
##    TIME.24.HOURS BASE_…¹ COUNTY ROAD  PLACE MV.IN…² BRIEF…³ NAME.…⁴ GENDER AGE  
##    <chr>         <chr>   <chr>  <chr> <chr> <chr>   <chr>   <chr>   <chr>  <chr>
##  1 0139          DTEO D… NAIRO… WAIY… KANG… KBR 74… VEHICL… UNKNOWN M      A    
##  2 0845          SAGANA  MURAN… SAGA… KAIN… KBY 06… VEHICL… DICKSO… M      61   
##  3 0930          MOGOTIO BARIN… EMIN… OTER… UNKNOWN VEHICL… RODGER… M      31   
##  4 1250          NAKURU  NAKURU NAIR… MARU… KBM 53… THE VE… ISAAC … M      A    
##  5 1730          KURIA   MIGORI NTIM… TARA… GK B 5… VEHICL… CHARLE… M      J    
##  6 1930          SAGANA  KIRIN… KENO… WACH… KBU 59… THE CY… UNKNOWN M      A    
##  7 2039          KANGUN… MAKUE… NAIR… MAKE… KCC 37… THE VE… BENARD… M      36   
##  8 2145          DTEO K… NAIRO… THIK… CAR … KCH 35… VEHICL… COLLIN… M      20   
##  9 0730          MURANGA MURAN… GAKO… KAMA… KCJ 92… THE VE… UNKNOWN M      A    
## 10 0930          THIKA   KIAMBU THIK… KAND… KBQ 57… VEHICL… FREDRI… M      A    
## # … with 730 more rows, 7 more variables: CAUSE.CODE <chr>, VICTIM_TYPE <chr>,
## #   NO. <chr>, Date <chr>, Hour <chr>, Month <dbl>, OWNER <chr>, and
## #   abbreviated variable names ¹BASE_SUB_BASE, ²MV.INVOLVED,
## #   ³BRIEF.ACCIDENT.DETAILS, ⁴NAME.OF.VICTIM

Remove irrelevant variables from the dataset such as No., name of victim, places

Notes: The Select function will enable this. The “-c” translates to remove columns

Test3<- Test2 %>%
  select(-c("NAME.OF.VICTIM", "PLACE", "NO.", "BRIEF.ACCIDENT.DETAILS"))
# View updated dataframe
colnames(Test3)

##  [1] "TIME.24.HOURS" "BASE_SUB_BASE" "COUNTY"        "ROAD"         
##  [5] "MV.INVOLVED"   "GENDER"        "AGE"           "CAUSE.CODE"   
##  [9] "VICTIM_TYPE"   "Date"          "Hour"          "Month"        
## [13] "OWNER"

Test3

## # A tibble: 740 × 13
##    TIME.…¹ BASE_…² COUNTY ROAD  MV.IN…³ GENDER AGE   CAUSE…⁴ VICTI…⁵ Date  Hour 
##    <chr>   <chr>   <chr>  <chr> <chr>   <chr>  <chr> <chr>   <chr>   <chr> <chr>
##  1 0139    DTEO D… NAIRO… WAIY… KBR 74… M      A     98      PEDEST… 2/22… 1    
##  2 0845    SAGANA  MURAN… SAGA… KBY 06… M      61    29      P/CYCL… 2/22… 8    
##  3 0930    MOGOTIO BARIN… EMIN… UNKNOWN M      31    30      PEDEST… 2/22… 9    
##  4 1250    NAKURU  NAKURU NAIR… KBM 53… M      A     37      DRIVER  2/22… 12   
##  5 1730    KURIA   MIGORI NTIM… GK B 5… M      J     36      PEDEST… 2/22… 17   
##  6 1930    SAGANA  KIRIN… KENO… KBU 59… M      A     37      RIDER … 2/22… 19   
##  7 2039    KANGUN… MAKUE… NAIR… KCC 37… M      36    82      RIDER   2/22… 20   
##  8 2145    DTEO K… NAIRO… THIK… KCH 35… M      20    98      PEDEST… 2/22… 21   
##  9 0730    MURANGA MURAN… GAKO… KCJ 92… M      A     26      PASSEN… 2/28… 7    
## 10 0930    THIKA   KIAMBU THIK… KBQ 57… M      A     10      P/CYCL… 2/28… 9    
## # … with 730 more rows, 2 more variables: Month <dbl>, OWNER <chr>, and
## #   abbreviated variable names ¹TIME.24.HOURS, ²BASE_SUB_BASE, ³MV.INVOLVED,
## #   ⁴CAUSE.CODE, ⁵VICTIM_TYPE

Remove records that time is unkown

Notes: Here we first get a view of the column, then filter out unwanted recors from the column. Next we substitute the records with HRS after them with nothing and finally confirm if the data set is clean enough

Test3$TIME.24.HOURS

##   [1] "0139"         "0845"         "0930"         "1250"         "1730"        
##   [6] "1930"         "2039"         "2145"         "0730"         "0930"        
##  [11] "1230"         "1850"         "2030"         "0730"         "0800"        
##  [16] "1000"         "1800"         "1800"         "2230"         "0600"        
##  [21] "0730"         "1205"         "1230"         "1230"         "1600"        
##  [26] "1630"         "1830"         "2045"         "2200"         "0300"        
##  [31] "1220"         "1430"         "1530"         "1730"         "2030"        
##  [36] "0630"         "0730"         "0930"         "1400"         "1500"        
##  [41] "2030"         "2030"         "2030"         "3000"         "0130"        
##  [46] "0230"         "0700"         "1030"         "1145"         "1610"        
##  [51] "1800"         "2130"         "3000"         "0840"         "1720"        
##  [56] "1800"         "2045"         "0840"         "1930"         "1930"        
##  [61] "1600"         "1830"         "2000"         "2000"         "2050"        
##  [66] "2230"         "1100"         "2100"         "0330"         "0330"        
##  [71] "1130"         "1430"         "2100"         "0610"         "1200"        
##  [76] "1500"         "1900"         "2030"         "2120"         "0330"        
##  [81] "0430"         "0445"         "0630"         "0745"         "1230"        
##  [86] "1530"         "1800"         "1830"         "2205"         "2215"        
##  [91] "1530"         "1930"         "2030"         "3000"         "0615"        
##  [96] "0930"         "0930"         "1330"         "1400"         "1500"        
## [101] "1540"         "1600"         "1700"         "1900"         "2115"        
## [106] "1400"         "1845"         "0730"         "0730"         "0930"        
## [111] "1330"         "1830"         "0220HRS"      "0600HRS"      "0810"        
## [116] "1230"         "1445"         "1530HRS"      "1730HRS"      "1830HRS"     
## [121] "845HRS"       "0710"         "1350"         "1620"         "1945"        
## [126] "NAROK"        "0630"         "0630"         "1820"         "2100"        
## [131] "0500hrs"      "1100"         "1550"         "1700"         "1725"        
## [136] "1930"         "2230"         "0045 HRS"     "0600 HRS"     "0700 HRS"    
## [141] "1800"         "1800"         "0835"         "1030"         "1100"        
## [146] "1600"         "1630"         "1900"         "UNKNOWN TIME" "0630"        
## [151] "1030"         "1500"         "1900"         "1930"         "2100"        
## [156] "2100"         "2330"         "0900"         "1200"         "1430"        
## [161] "1438"         "1645"         "1700"         "1845"         "1930"        
## [166] "2030"         "2100"         "2110"         "4000"         "0630"        
## [171] "1340"         "1610"         "1800"         "1856"         "UNKNOWN"     
## [176] "0715"         "1330"         "2030"         "0230"         "0730"        
## [181] "1130"         "1700"         "1955"         "2045"         "2230"        
## [186] "2345"         "0230"         "0900"         "1540"         "1930"        
## [191] "2000"         "2000"         "2320"         "1000"         "1030"        
## [196] "1730"         "1730"         "1750"         "1900"         "2100"        
## [201] "2245"         "2330"         "1100"         "1200"         "2330"        
## [206] "1030"         "2000"         "0630"         "0730"         "1100"        
## [211] "1830"         "1900"         "1940"         "2030"         "2130"        
## [216] "0600"         "1230"         "1410"         "1700"         "1800"        
## [221] "1945"         "2115"         "0130"         "0530"         "1200"        
## [226] "1645"         "1700"         "1720"         "2140"         "0530"        
## [231] "0530"         "1140"         "1230"         "1600"         "1745"        
## [236] "2100"         "1000"         "1140"         "1730"         "0210"        
## [241] "1100"         "1300"         "1330"         "1600"         "1630"        
## [246] "1900"         "1930"         "2040"         "UNKNOWN"      "0220"        
## [251] "0645"         "0900"         "2015"         "2100"         "0400"        
## [256] "1550"         "1900"         "2030"         "2230"         "1330"        
## [261] "1600"         "1830"         "1900"         "1900"         "1900"        
## [266] "2015"         "2030"         "1600"         "1700"         "1900"        
## [271] "2000"         "2030"         "1200"         "2015"         "2015"        
## [276] "2100"         "2200"         "0500"         "1200"         "1430"        
## [281] "1630"         "1730"         "3000"         "1230"         "1500"        
## [286] "2300"         "0820"         "1500"         "2330"         "4500"        
## [291] "0500"         "0700"         "0730"         "0730"         "2000"        
## [296] "2100"         "2200"         "0730"         "1000"         "1720"        
## [301] "1945"         "2000"         "2000"         "2145"         "2230"        
## [306] "0730"         "1100"         "1730"         "2130"         "0430"        
## [311] "1030"         "2030"         "2230"         "1720"         "2030"        
## [316] "2240"         "2240"         "2300"         "1740"         "1800"        
## [321] "0700"         "1230"         "1645"         "1750"         "2000"        
## [326] "0400"         "0700"         "1130"         "1730"         "1900"        
## [331] "1940"         "2030"         "2040"         "2200"         "2330"        
## [336] "2330"         "0800"         "1120"         "1350"         "1730"        
## [341] "1730"         "1800"         "2120"         "2145"         "2155"        
## [346] "1000"         "2030"         "2100"         "0600"         "1400"        
## [351] "1520"         "2000"         "2130"         "2330"         "2345"        
## [356] "0100"         "2140"         "0200"         "1030"         "1300"        
## [361] "1600"         "1600"         "1830"         "1945"         "1130"        
## [366] "1530"         "1700"         "1730"         "2000"         "0400"        
## [371] "1730"         "1900"         "2020"         "2100"         "2100"        
## [376] "2130"         "2345"         "3000"         "1500"         "1500"        
## [381] "1600"         "1600"         "1830"         "2000"         "2030"        
## [386] "2050"         "0215"         "1050"         "1800"         "1800"        
## [391] "1830"         "2030"         "2100"         "2300"         "2330"        
## [396] "0800"         "1815"         "1930"         "2030"         "2230"        
## [401] "2230"         "0230"         "0600"         "0830"         "1530"        
## [406] "1730"         "2000"         "UNKNOWN"      "UNKNOWN"      "UNKNOWN"     
## [411] "0230"         "0730"         "1445"         "1540"         "1710"        
## [416] "1830"         "2200"         "0700"         "1100"         "1130"        
## [421] "1810"         "2050"         "0240"         "0530"         "0630"        
## [426] "1130"         "1500"         "1500"         "1500"         "1520"        
## [431] "1840"         "2245"         "0630"         "1050"         "1200"        
## [436] "1750"         "2000"         "0600"         "1400"         "1730"        
## [441] "1830"         "1845"         "1900"         "1900"         "2050"        
## [446] "2230"         "0845"         "1130"         "1630"         "1700"        
## [451] "1930"         "2000"         "2100"         "2145"         "UNKNOWN TIME"
## [456] "UNKNOWN TIME" "UNKNOWN TIME" "UNKNOWN TIME" "1700"         "0930"        
## [461] "1045"         "1400"         "1430"         "2000"         "2010"        
## [466] "UNKNOWN"      "0600"         "0830"         "1250"         "1330"        
## [471] "1530"         "1830"         "2200"         "0320"         "1400"        
## [476] "1800"         "1900"         "2030"         "2140"         "0700"        
## [481] "1000"         "1630"         "1700"         "2120"         "2130"        
## [486] "2300"         "3000"         "0400"         "0630"         "1230"        
## [491] "1800"         "2230"         "2320"         "0545"         "0620"        
## [496] "1300"         "1800"         "1830"         "2200"         "2200"        
## [501] "1630"         "1730"         "1740"         "1830"         "1830"        
## [506] "2300"         "0810"         "0900"         "2130"         "1730"        
## [511] "0230"         "0430"         "1000"         "1900"         "2000"        
## [516] "2000"         "2230"         "2300"         "2300"         "2330"        
## [521] "4000"         "#N/A"         "#N/A"         "0440"         "1200"        
## [526] "1430"         "1430"         "2100"         "#N/A"         "1630"        
## [531] "1745"         "2030"         "3000"         "5000"         "1800"        
## [536] "1930"         "1150"         "2200"         "1840"         "2100"        
## [541] "2130"         "0700"         "0700"         "0715"         "1200"        
## [546] "1630"         "0800"         "0930"         "1030"         "1100"        
## [551] "1400"         "1720"         "0700"         "0830"         "1440"        
## [556] "1645"         "0930"         "1240"         "1440"         "0940"        
## [561] "1130"         "1150"         "1230"         "1600"         "1600"        
## [566] "1630"         "1630"         "1700"         "1740"         "1810"        
## [571] "1845"         "1900"         "1930"         "1945"         "2000"        
## [576] "2000"         "2130"         "2230"         "2300"         "2300"        
## [581] "2300"         "0600"         "0630"         "1030"         "1320"        
## [586] "1820"         "2055"         "1420"         "1440"         "0100"        
## [591] "0130"         "0400"         "0700"         "1000"         "1230"        
## [596] "1945"         "2200"         "2315"         "UNKNOWN TIME" "1530"        
## [601] "1530"         "1900"         "UNKNOWN TIME" "0640"         "1100"        
## [606] "1345"         "2000"         "2010"         "1545"         "1700"        
## [611] "1945"         "2000"         "2230"         "1200"         "1330"        
## [616] "1330"         "1730"         "2340"         "5"            "0130"        
## [621] "0430"         "0630"         "0630"         "1640"         "1645"        
## [626] "2030"         "2130"         "2200"         "2345"         "3000"        
## [631] "5"            "5"            "0430"         "0600"         "1630"        
## [636] "2050"         "2130"         "2230"         "0720"         "1630"        
## [641] "1800"         "1820"         "1900"         "2000"         "2030"        
## [646] "2040"         "0345"         "1430"         "1530"         "1600"        
## [651] "1700"         "2030"         "2140"         "2210"         "UNKNOWN"     
## [656] "0600"         "0700"         "1000"         "2200"         "2245"        
## [661] "0730"         "1800"         "2000"         "2040"         "0800"        
## [666] "0900"         "1350"         "2000"         "2130"         "0630"        
## [671] "1030"         "1210"         "1400"         "1000"         "1000"        
## [676] "1030"         "1330"         "1400"         "1600"         "1640"        
## [681] "1830"         "1900"         "2000"         "2140"         "2230"        
## [686] "UNKNOWN TIME" "0700"         "1200"         "1200"         "1330"        
## [691] "1330"         "2000"         "2030"         "1000"         "1630"        
## [696] "1630"         "1920"         "0600"         "1130"         "1440"        
## [701] "1640"         "1920"         "2030"         "1630"         "2000"        
## [706] "2015"         "2025"         "2030"         "2045"         "2130"        
## [711] "2230"         "0430"         "1300"         "1530"         "1630"        
## [716] "1745"         "1940"         "1030"         "1300"         "1730"        
## [721] "2000"         "2030"         "0130"         "0630"         "0745"        
## [726] "1030"         "1100"         "1330"         "1730"         "2136"        
## [731] "0745"         "1430"         "1515"         "1600"         "1745"        
## [736] "1747"         "1830"         "1930"         "2030"         "2300"

Test4 <- Test3 %>% 
  filter(TIME.24.HOURS != "UNKNOWN TIME")%>%
  filter(TIME.24.HOURS != "UNKNOWN")%>%
  filter(TIME.24.HOURS != "#N/A")%>%
  filter(TIME.24.HOURS != "NAROK")
# Remove "hrs" from time  records
Test4$TIME.24.HOURS <- gsub("HRS", "", Test4$TIME.24.HOURS)  
# View updated dataframe
colnames(Test4)

##  [1] "TIME.24.HOURS" "BASE_SUB_BASE" "COUNTY"        "ROAD"         
##  [5] "MV.INVOLVED"   "GENDER"        "AGE"           "CAUSE.CODE"   
##  [9] "VICTIM_TYPE"   "Date"          "Hour"          "Month"        
## [13] "OWNER"

# Check if time column is clean 
Test4$TIME.24.HOURS

##   [1] "0139"    "0845"    "0930"    "1250"    "1730"    "1930"    "2039"   
##   [8] "2145"    "0730"    "0930"    "1230"    "1850"    "2030"    "0730"   
##  [15] "0800"    "1000"    "1800"    "1800"    "2230"    "0600"    "0730"   
##  [22] "1205"    "1230"    "1230"    "1600"    "1630"    "1830"    "2045"   
##  [29] "2200"    "0300"    "1220"    "1430"    "1530"    "1730"    "2030"   
##  [36] "0630"    "0730"    "0930"    "1400"    "1500"    "2030"    "2030"   
##  [43] "2030"    "3000"    "0130"    "0230"    "0700"    "1030"    "1145"   
##  [50] "1610"    "1800"    "2130"    "3000"    "0840"    "1720"    "1800"   
##  [57] "2045"    "0840"    "1930"    "1930"    "1600"    "1830"    "2000"   
##  [64] "2000"    "2050"    "2230"    "1100"    "2100"    "0330"    "0330"   
##  [71] "1130"    "1430"    "2100"    "0610"    "1200"    "1500"    "1900"   
##  [78] "2030"    "2120"    "0330"    "0430"    "0445"    "0630"    "0745"   
##  [85] "1230"    "1530"    "1800"    "1830"    "2205"    "2215"    "1530"   
##  [92] "1930"    "2030"    "3000"    "0615"    "0930"    "0930"    "1330"   
##  [99] "1400"    "1500"    "1540"    "1600"    "1700"    "1900"    "2115"   
## [106] "1400"    "1845"    "0730"    "0730"    "0930"    "1330"    "1830"   
## [113] "0220"    "0600"    "0810"    "1230"    "1445"    "1530"    "1730"   
## [120] "1830"    "845"     "0710"    "1350"    "1620"    "1945"    "0630"   
## [127] "0630"    "1820"    "2100"    "0500hrs" "1100"    "1550"    "1700"   
## [134] "1725"    "1930"    "2230"    "0045 "   "0600 "   "0700 "   "1800"   
## [141] "1800"    "0835"    "1030"    "1100"    "1600"    "1630"    "1900"   
## [148] "0630"    "1030"    "1500"    "1900"    "1930"    "2100"    "2100"   
## [155] "2330"    "0900"    "1200"    "1430"    "1438"    "1645"    "1700"   
## [162] "1845"    "1930"    "2030"    "2100"    "2110"    "4000"    "0630"   
## [169] "1340"    "1610"    "1800"    "1856"    "0715"    "1330"    "2030"   
## [176] "0230"    "0730"    "1130"    "1700"    "1955"    "2045"    "2230"   
## [183] "2345"    "0230"    "0900"    "1540"    "1930"    "2000"    "2000"   
## [190] "2320"    "1000"    "1030"    "1730"    "1730"    "1750"    "1900"   
## [197] "2100"    "2245"    "2330"    "1100"    "1200"    "2330"    "1030"   
## [204] "2000"    "0630"    "0730"    "1100"    "1830"    "1900"    "1940"   
## [211] "2030"    "2130"    "0600"    "1230"    "1410"    "1700"    "1800"   
## [218] "1945"    "2115"    "0130"    "0530"    "1200"    "1645"    "1700"   
## [225] "1720"    "2140"    "0530"    "0530"    "1140"    "1230"    "1600"   
## [232] "1745"    "2100"    "1000"    "1140"    "1730"    "0210"    "1100"   
## [239] "1300"    "1330"    "1600"    "1630"    "1900"    "1930"    "2040"   
## [246] "0220"    "0645"    "0900"    "2015"    "2100"    "0400"    "1550"   
## [253] "1900"    "2030"    "2230"    "1330"    "1600"    "1830"    "1900"   
## [260] "1900"    "1900"    "2015"    "2030"    "1600"    "1700"    "1900"   
## [267] "2000"    "2030"    "1200"    "2015"    "2015"    "2100"    "2200"   
## [274] "0500"    "1200"    "1430"    "1630"    "1730"    "3000"    "1230"   
## [281] "1500"    "2300"    "0820"    "1500"    "2330"    "4500"    "0500"   
## [288] "0700"    "0730"    "0730"    "2000"    "2100"    "2200"    "0730"   
## [295] "1000"    "1720"    "1945"    "2000"    "2000"    "2145"    "2230"   
## [302] "0730"    "1100"    "1730"    "2130"    "0430"    "1030"    "2030"   
## [309] "2230"    "1720"    "2030"    "2240"    "2240"    "2300"    "1740"   
## [316] "1800"    "0700"    "1230"    "1645"    "1750"    "2000"    "0400"   
## [323] "0700"    "1130"    "1730"    "1900"    "1940"    "2030"    "2040"   
## [330] "2200"    "2330"    "2330"    "0800"    "1120"    "1350"    "1730"   
## [337] "1730"    "1800"    "2120"    "2145"    "2155"    "1000"    "2030"   
## [344] "2100"    "0600"    "1400"    "1520"    "2000"    "2130"    "2330"   
## [351] "2345"    "0100"    "2140"    "0200"    "1030"    "1300"    "1600"   
## [358] "1600"    "1830"    "1945"    "1130"    "1530"    "1700"    "1730"   
## [365] "2000"    "0400"    "1730"    "1900"    "2020"    "2100"    "2100"   
## [372] "2130"    "2345"    "3000"    "1500"    "1500"    "1600"    "1600"   
## [379] "1830"    "2000"    "2030"    "2050"    "0215"    "1050"    "1800"   
## [386] "1800"    "1830"    "2030"    "2100"    "2300"    "2330"    "0800"   
## [393] "1815"    "1930"    "2030"    "2230"    "2230"    "0230"    "0600"   
## [400] "0830"    "1530"    "1730"    "2000"    "0230"    "0730"    "1445"   
## [407] "1540"    "1710"    "1830"    "2200"    "0700"    "1100"    "1130"   
## [414] "1810"    "2050"    "0240"    "0530"    "0630"    "1130"    "1500"   
## [421] "1500"    "1500"    "1520"    "1840"    "2245"    "0630"    "1050"   
## [428] "1200"    "1750"    "2000"    "0600"    "1400"    "1730"    "1830"   
## [435] "1845"    "1900"    "1900"    "2050"    "2230"    "0845"    "1130"   
## [442] "1630"    "1700"    "1930"    "2000"    "2100"    "2145"    "1700"   
## [449] "0930"    "1045"    "1400"    "1430"    "2000"    "2010"    "0600"   
## [456] "0830"    "1250"    "1330"    "1530"    "1830"    "2200"    "0320"   
## [463] "1400"    "1800"    "1900"    "2030"    "2140"    "0700"    "1000"   
## [470] "1630"    "1700"    "2120"    "2130"    "2300"    "3000"    "0400"   
## [477] "0630"    "1230"    "1800"    "2230"    "2320"    "0545"    "0620"   
## [484] "1300"    "1800"    "1830"    "2200"    "2200"    "1630"    "1730"   
## [491] "1740"    "1830"    "1830"    "2300"    "0810"    "0900"    "2130"   
## [498] "1730"    "0230"    "0430"    "1000"    "1900"    "2000"    "2000"   
## [505] "2230"    "2300"    "2300"    "2330"    "4000"    "0440"    "1200"   
## [512] "1430"    "1430"    "2100"    "1630"    "1745"    "2030"    "3000"   
## [519] "5000"    "1800"    "1930"    "1150"    "2200"    "1840"    "2100"   
## [526] "2130"    "0700"    "0700"    "0715"    "1200"    "1630"    "0800"   
## [533] "0930"    "1030"    "1100"    "1400"    "1720"    "0700"    "0830"   
## [540] "1440"    "1645"    "0930"    "1240"    "1440"    "0940"    "1130"   
## [547] "1150"    "1230"    "1600"    "1600"    "1630"    "1630"    "1700"   
## [554] "1740"    "1810"    "1845"    "1900"    "1930"    "1945"    "2000"   
## [561] "2000"    "2130"    "2230"    "2300"    "2300"    "2300"    "0600"   
## [568] "0630"    "1030"    "1320"    "1820"    "2055"    "1420"    "1440"   
## [575] "0100"    "0130"    "0400"    "0700"    "1000"    "1230"    "1945"   
## [582] "2200"    "2315"    "1530"    "1530"    "1900"    "0640"    "1100"   
## [589] "1345"    "2000"    "2010"    "1545"    "1700"    "1945"    "2000"   
## [596] "2230"    "1200"    "1330"    "1330"    "1730"    "2340"    "5"      
## [603] "0130"    "0430"    "0630"    "0630"    "1640"    "1645"    "2030"   
## [610] "2130"    "2200"    "2345"    "3000"    "5"       "5"       "0430"   
## [617] "0600"    "1630"    "2050"    "2130"    "2230"    "0720"    "1630"   
## [624] "1800"    "1820"    "1900"    "2000"    "2030"    "2040"    "0345"   
## [631] "1430"    "1530"    "1600"    "1700"    "2030"    "2140"    "2210"   
## [638] "0600"    "0700"    "1000"    "2200"    "2245"    "0730"    "1800"   
## [645] "2000"    "2040"    "0800"    "0900"    "1350"    "2000"    "2130"   
## [652] "0630"    "1030"    "1210"    "1400"    "1000"    "1000"    "1030"   
## [659] "1330"    "1400"    "1600"    "1640"    "1830"    "1900"    "2000"   
## [666] "2140"    "2230"    "0700"    "1200"    "1200"    "1330"    "1330"   
## [673] "2000"    "2030"    "1000"    "1630"    "1630"    "1920"    "0600"   
## [680] "1130"    "1440"    "1640"    "1920"    "2030"    "1630"    "2000"   
## [687] "2015"    "2025"    "2030"    "2045"    "2130"    "2230"    "0430"   
## [694] "1300"    "1530"    "1630"    "1745"    "1940"    "1030"    "1300"   
## [701] "1730"    "2000"    "2030"    "0130"    "0630"    "0745"    "1030"   
## [708] "1100"    "1330"    "1730"    "2136"    "0745"    "1430"    "1515"   
## [715] "1600"    "1745"    "1747"    "1830"    "1930"    "2030"    "2300"

Arrange your dataframe to start with Month, time and car involved

Notes: We see the Select column come into play again. this time it is used in ordering column. Quite the powerful function as you can see

Test5  <- Test4 %>% select(Month, TIME.24.HOURS, MV.INVOLVED, everything())
# View updated data frame
Test5

## # A tibble: 721 × 13
##    Month TIME.…¹ MV.IN…² BASE_…³ COUNTY ROAD  GENDER AGE   CAUSE…⁴ VICTI…⁵ Date 
##    <dbl> <chr>   <chr>   <chr>   <chr>  <chr> <chr>  <chr> <chr>   <chr>   <chr>
##  1     2 0139    KBR 74… DTEO D… NAIRO… WAIY… M      A     98      PEDEST… 2/22…
##  2     2 0845    KBY 06… SAGANA  MURAN… SAGA… M      61    29      P/CYCL… 2/22…
##  3     2 0930    UNKNOWN MOGOTIO BARIN… EMIN… M      31    30      PEDEST… 2/22…
##  4     2 1250    KBM 53… NAKURU  NAKURU NAIR… M      A     37      DRIVER  2/22…
##  5     2 1730    GK B 5… KURIA   MIGORI NTIM… M      J     36      PEDEST… 2/22…
##  6     2 1930    KBU 59… SAGANA  KIRIN… KENO… M      A     37      RIDER … 2/22…
##  7     2 2039    KCC 37… KANGUN… MAKUE… NAIR… M      36    82      RIDER   2/22…
##  8     2 2145    KCH 35… DTEO K… NAIRO… THIK… M      20    98      PEDEST… 2/22…
##  9     2 0730    KCJ 92… MURANGA MURAN… GAKO… M      A     26      PASSEN… 2/28…
## 10     2 0930    KBQ 57… THIKA   KIAMBU THIK… M      A     10      P/CYCL… 2/28…
## # … with 711 more rows, 2 more variables: Hour <chr>, OWNER <chr>, and
## #   abbreviated variable names ¹TIME.24.HOURS, ²MV.INVOLVED, ³BASE_SUB_BASE,
## #   ⁴CAUSE.CODE, ⁵VICTIM_TYPE

colnames(Test5)

##  [1] "Month"         "TIME.24.HOURS" "MV.INVOLVED"   "BASE_SUB_BASE"
##  [5] "COUNTY"        "ROAD"          "GENDER"        "AGE"          
##  [9] "CAUSE.CODE"    "VICTIM_TYPE"   "Date"          "Hour"         
## [13] "OWNER"

Sort Victim type in descending order

Notes: A new desc function is used and your guess is as good as mine in its use

Test6 <- Test5 %>% arrange(desc(VICTIM_TYPE))
# View updated dataframe
Test6

## # A tibble: 721 × 13
##    Month TIME.…¹ MV.IN…² BASE_…³ COUNTY ROAD  GENDER AGE   CAUSE…⁴ VICTI…⁵ Date 
##    <dbl> <chr>   <chr>   <chr>   <chr>  <chr> <chr>  <chr> <chr>   <chr>   <chr>
##  1     3 0930    M/CYCL… NANDI … NANDI  KABI… M      25&28 37      RIDERS  3/23…
##  2     8 1000    KBU666… NYARI   MOMBA… OLD … M      A     16      RIDER … 8/4/…
##  3     2 1930    KBU 59… SAGANA  KIRIN… KENO… M      A     37      RIDER … 2/22…
##  4     3 1500    KCF 03… PAP ON… KISUMU KATI… M&F    42,2… 26      RIDER … 3/18…
##  5     3 0930    KYX 55… KANDARA MURAN… MAKE… M      A     37      RIDER … 3/21…
##  6     3 1400    UNREGI… DTEO G… NAIRO… PROF… M      A     36      RIDER … 3/21…
##  7     3 1540    KCB 80… SAGANA  KIRIN… SAGA… M      21 &… 26      RIDER … 3/21…
##  8     3 1700    KCX 81… AWENDO  MIGORI AWEN… M      A     50      RIDER … 3/21…
##  9     3 0730    M/CYCL… MUMIAS  KAKAM… EKER… M & F… 52 &… 98      RIDER … 3/23…
## 10     2 2039    KCC 37… KANGUN… MAKUE… NAIR… M      36    82      RIDER   2/22…
## # … with 711 more rows, 2 more variables: Hour <chr>, OWNER <chr>, and
## #   abbreviated variable names ¹TIME.24.HOURS, ²MV.INVOLVED, ³BASE_SUB_BASE,
## #   ⁴CAUSE.CODE, ⁵VICTIM_TYPE

Check duplicated records in terms of cause code, MV involved and base

Notes: First we list the duplicated functions in the specified column using the duplicated() function. Next We remove them and confirm this with the nrows function

duplicated_records <- Test6 %>% select(CAUSE.CODE, MV.INVOLVED, BASE_SUB_BASE) %>% 
  duplicated()
# Print the duplicated records
print(Test6[duplicated_records,])

## # A tibble: 21 × 13
##    Month TIME.…¹ MV.IN…² BASE_…³ COUNTY ROAD  GENDER AGE   CAUSE…⁴ VICTI…⁵ Date 
##    <dbl> <chr>   <chr>   <chr>   <chr>  <chr> <chr>  <chr> <chr>   <chr>   <chr>
##  1    10 0700    UNKNOW… MAKUENI MAKUE… MBUN… M      57    98      PEDEST… 10/2…
##  2    11 0630    UNKNOWN EMBAKA… NAIRO… MOMB… M      A     98      PEDEST… 11/5…
##  3     7 1600    KBT 33… KISII   KISII  KISI… M      A     26      PASSEN… 7/14…
##  4     3 2030    KMDR 7… MASENO  KISUMU KISU… M      A     #N/A    M/CYCL… 3/10…
##  5     7 2100    KBA 18… BUSIA   BUSIA  BUTU… M      A     79      M/CYCL… 7/16…
##  6     7 1500    KCF 27… BONDO   SIAYA  BOND… M      A     10      M/CYCL… 7/19…
##  7     8 2300    KMCL 8… MAGUMU  NYAND… FLYO… M      A     68      M/CYCL… 8/11…
##  8     8 1630    KBK 66… MAUA    MERU   MERU… M      A     26      M/CYCL… 8/27…
##  9     8 2000    KCJ 68… BUTERE  KAKAM… SHIA… M      A     8       M/CYCL… 8/27…
## 10     8 2300    KBU 17… SOTIK   BOMET  KAPL… M      A     8       M/CYCL… 8/27…
## # … with 11 more rows, 2 more variables: Hour <chr>, OWNER <chr>, and
## #   abbreviated variable names ¹TIME.24.HOURS, ²MV.INVOLVED, ³BASE_SUB_BASE,
## #   ⁴CAUSE.CODE, ⁵VICTIM_TYPE

#Check duplicated records in terms of cause code, MV involved and base
Test7 <- Test6 %>% distinct(CAUSE.CODE, MV.INVOLVED, BASE_SUB_BASE, .keep_all = TRUE)
#CHECK IF THEY HAVE BEEN REMOVED
nrow(Test6)

## [1] 721

nrow(Test7)

## [1] 700

#rows have been removed

rename column names that have spaces to have underscores instead of spaces

Notes: We clean column names a final time and name the data HDX clean

HDX_clean <- Test7 %>%
  rename(TIME_24_HOURS = `TIME.24.HOURS`, MV_INVOLVED = `MV.INVOLVED`, CAUSE_CODE = `CAUSE.CODE`)
#check all column names
colnames(HDX_clean)

##  [1] "Month"         "TIME_24_HOURS" "MV_INVOLVED"   "BASE_SUB_BASE"
##  [5] "COUNTY"        "ROAD"          "GENDER"        "AGE"          
##  [9] "CAUSE_CODE"    "VICTIM_TYPE"   "Date"          "Hour"         
## [13] "OWNER"

DATA ANALYSIS AND VISUALIZATION

Calculate number of accidents’ across months; Plot static/interactive graph showing this

Notes: We get frequency of accidents per month then use th plotly function to come up with an interactive graph. The frequency is denoted with n

accidents_by_month <- HDX_clean %>% count(Month)
# Print the result
print(accidents_by_month)

## # A tibble: 10 × 2
##    Month     n
##    <dbl> <int>
##  1     2    13
##  2     3   139
##  3     4    12
##  4     5    69
##  5     6    48
##  6     7   158
##  7     8   121
##  8     9    59
##  9    10    59
## 10    11    22

# Create an interactive bar plot of accidents by month
plot_ly(data = accidents_by_month, x = ~Month, y = ~n, type = "bar", 
        name = "Number of accidents", text = ~n) %>%
  layout(xaxis = list(title = "Month"), yaxis = list(title = "Number of accidents"), 
         title = "Accidents by month")

Identify most frequent accident type; Plot static/interactive graph showing this

Notes: We first create a subset table called accident types. Then we find the most frequent accident using the which max function. Finally we plot an interactive graph

# Create a frequency table of the CAUSE.CODE variable
accident_types <- table(HDX_clean$CAUSE_CODE)
#See frequency table
View(accident_types)
# Find the most frequent accident type
most_frequent_accident <- names(accident_types)[which.max(accident_types)]
#view most frequent accident type
most_frequent_accident

## [1] "98"

# Print the most frequent accident type
cat("The most frequent accident type is", most_frequent_accident)

## The most frequent accident type is 98

# Create a bar chart using plot_ly
plot_ly(x = names(accident_types), y = accident_types, type = 'bar', marker = list(color = '#1f77b4')) %>%
  layout(xaxis = list(title = "Accident type"), 
         yaxis = list(title = "Frequency"),
         title = "Accident types",
         showlegend = FALSE) %>%
  add_text(x = names(accident_types), y = accident_types, text = accident_types, textposition = "top", textfont = list(size = 14))

## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...

Summary of affected gender by accident type; Plot static/interactive graph showing this

Notes: A lot to unpack here; First i converted the to GENDER column to factors to enable math calculations. Then created a subset data set of only male and female genders This column was unclean and I was not sure on what exactly some letters meant so I opted to eliminate these records for this subset. This reduced sample size which is quite dangerous. But it was not a big chop to affect our results so we’re good. Finally I generated a column that disaggregates the M and F in order to allow for plotting

#convert to character to enable mathematical calculations on character column type
HDX_clean$GENDER <- as.factor(HDX_clean$GENDER)

# Subset data for only male and female genders
#first we get the male and female columns
gender_by_accident_mf <- HDX_clean %>%
  filter(GENDER %in% c("M", "F")) %>%
  group_by(CAUSE_CODE, GENDER) %>%
  summarize(n = n()) %>%
  ungroup() %>%
  pivot_wider(names_from = GENDER, values_from = n, values_fill = 0) %>%
  mutate(Total = M + F)

## `summarise()` has grouped output by 'CAUSE_CODE'. You can override using the
## `.groups` argument.

#then we generate a column that disaggregates the M and F in order to allow for plotting
gender_by_accident_mf_long <- gender_by_accident_mf %>%
  pivot_longer(cols = c("M", "F"), names_to = "Gender", values_to = "n")
# View the resulting data frame
gender_by_accident_mf_long

## # A tibble: 128 × 4
##    CAUSE_CODE Total Gender     n
##    <chr>      <int> <chr>  <int>
##  1 #N/A          19 M         16
##  2 #N/A          19 F          3
##  3 10            57 M         49
##  4 10            57 F          8
##  5 11             7 M          6
##  6 11             7 F          1
##  7 12             1 M          1
##  8 12             1 F          0
##  9 13             1 M          1
## 10 13             1 F          0
## # … with 118 more rows

# finally we Plot a graph to show accident type disaggregated by gender
ggplot(gender_by_accident_mf_long, aes(x = n, y = CAUSE_CODE, fill = Gender)) +
  geom_col(position = "dodge") +
  geom_text(aes(label = ifelse(n > 0, n, "")), position = position_dodge(width = 0.9), vjust = -0.5)+
  xlab("Number of Accidents") +
  ylab("Accident Type") +
  ggtitle("Gender by Accident Type (Male and Female Only)") +
  scale_fill_manual(values = c("pink","blue"))

Which are the top 10 Accident hotspot counties; Plot static/interactive graph showing this

Notes: The Head function specifies the number of rows

top_counties <- HDX_clean %>%
  group_by(COUNTY) %>%
  summarize(n = n()) %>%
  arrange(desc(n)) %>%
  head(10)
#view top counties
top_counties

## # A tibble: 10 × 2
##    COUNTY       n
##    <chr>    <int>
##  1 NAIROBI    109
##  2 KIAMBU      63
##  3 NAKURU      43
##  4 MACHAKOS    39
##  5 MAKUENI     30
##  6 KAKAMEGA    29
##  7 NYERI       28
##  8 KERICHO     21
##  9 KISUMU      21
## 10 EMBU        19

#USING AN INTERACTIVE PLOT
plot_ly(top_counties, x = ~ COUNTY , y = ~n, type = "bar", text = ~n) %>%
  layout(xaxis = list(title = "County Name"),
         yaxis = list(title = "Number of Accidents"),
         title = "Top 10 Accident Hotspot Counties")

# Subset the data to have data for Nairobi metropolitan counties only (Nairobi,Kiambu, Machakos, Kajiado,Muranga) and call it “NMS_data”

Notes: Filter function comes into play

NMS_data<-filter(HDX_clean, COUNTY == 'NAIROBI' |
                    COUNTY == 'MACHAKOS' |
                    COUNTY == 'KIAMBU' |
                    COUNTY == 'KAJIADO' |
                    COUNTY == 'MURANGA' )


head(NMS_data)

## # A tibble: 6 × 13
##   Month TIME_2…¹ MV_IN…² BASE_…³ COUNTY ROAD  GENDER AGE   CAUSE…⁴ VICTI…⁵ Date 
##   <dbl> <chr>    <chr>   <chr>   <chr>  <chr> <fct>  <chr> <chr>   <chr>   <chr>
## 1     3 0930     KYX 55… KANDARA MURAN… MAKE… M      A     37      RIDER … 3/21…
## 2     3 1400     UNREGI… DTEO G… NAIRO… PROF… M      A     36      RIDER … 3/21…
## 3     3 2030     KBB 71… DTEO I… NAIRO… LIKO… M      A     39      RIDER   3/9/…
## 4     3 1200     KBJ 68… JUJA    KIAMBU GATU… M      A     8       RIDER   3/18…
## 5     3 1500     KBS 96… THIKA   KIAMBU NYER… M      40    57      RIDER   3/21…
## 6     3 0710     M/V KB… DTEO G… NAIRO… LOWE… M      39    8       RIDER   3/25…
## # … with 2 more variables: Hour <chr>, OWNER <chr>, and abbreviated variable
## #   names ¹TIME_24_HOURS, ²MV_INVOLVED, ³BASE_SUB_BASE, ⁴CAUSE_CODE,
## #   ⁵VICTIM_TYPE

Using NMS_data find out which county has experienced the highest number of accidents; Plot static/interactive graph showing this

Notes: Take note of the group by function that enables summaries of frequencies

#Steps vs Calories
highest_accidents <- NMS_data%>%
  group_by(COUNTY) %>%
  summarize(accidents = n()) %>%
  arrange(desc(accidents))%>%
  ungroup() 
  
highest_accidents

## # A tibble: 5 × 2
##   COUNTY   accidents
##   <chr>        <int>
## 1 NAIROBI        109
## 2 KIAMBU          63
## 3 MACHAKOS        39
## 4 KAJIADO         19
## 5 MURANGA         14

ggplot(highest_accidents, aes(x = reorder(COUNTY, accidents), y = accidents)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  xlab("County") +
  ylab("Number of Accidents") +
  ggtitle("Top Metropolitan Counties by Number of Accidents") +
  geom_text(aes(label = accidents), position = position_dodge(width = 0.9), vjust = -0.5)+
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

# Print the most frequent accident type
top<-head(highest_accidents,1)
cat("The County with the most accidents is", top$COUNTY)

## The County with the most accidents is NAIROBI

# Using NMS_data find out which month do accidents occurs more rampantly and the counties; Plot static/interactive graph showing this

Notes: Position dodge helps make it a group bar chart for easy summary on the visual. The scale_x_discrete helps list all months on the x scale

accidents_by_month_county <- NMS_data %>%
  group_by(Month, COUNTY) %>%
  summarize(accidents = n()) %>%
  ungroup()

## `summarise()` has grouped output by 'Month'. You can override using the
## `.groups` argument.

#view accidents
accidents_by_month_county

## # A tibble: 42 × 3
##    Month COUNTY   accidents
##    <dbl> <chr>        <int>
##  1     2 KIAMBU           2
##  2     2 MACHAKOS         1
##  3     2 MURANGA          2
##  4     2 NAIROBI          2
##  5     3 KAJIADO          3
##  6     3 KIAMBU          18
##  7     3 MACHAKOS         8
##  8     3 MURANGA          2
##  9     3 NAIROBI         18
## 10     4 KIAMBU           1
## # … with 32 more rows

#plot
ggplot(accidents_by_month_county, aes(x = Month, y = accidents, fill = COUNTY)) +
  geom_bar(stat = "identity", position = "dodge") +
  xlab("Month") +
  ylab("Number of Accidents") +
  geom_text(aes(label=accidents), position=position_dodge(width=0.9), vjust=-0.5) +
  ggtitle("Accidents by Month and County")+
  scale_x_discrete(limits = unique(accidents_by_month_county$Month))

## Warning: Continuous limits supplied to discrete scale.
## ℹ Did you mean `limits = factor(...)` or `scale_*_continuous()`?