Notes:The following packages were used(description shown)
library(tidyverse)#For data cleaning and organizing
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)#For plotting
library(knitr)#For use with rmarkdown in presentation
library(lubridate)#For date use and conversion
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(rmarkdown)#For creating a sharable document
library(janitor)#For easing data cleaning
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(skimr)#For easy summary
library(plotly)#For interactive plots
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
Notes:Three data sets were imported for this analysis
#used only two data sets that covered 24 hours(initially)
#added weight data set to get context
Test <- read_csv("C:\\Users\\AAH\\Downloads\\Data_intern_task\\kenya-accidents-database.csv")
## Rows: 740 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): TIME 24 HOURS, BASE/SUB BASE, COUNTY, ROAD, PLACE, MV INVOLVED, BR...
## dbl (1): Month
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(Test)
## [1] "TIME 24 HOURS" "BASE/SUB BASE" "COUNTY"
## [4] "ROAD" "PLACE" "MV INVOLVED"
## [7] "BRIEF ACCIDENT DETAILS" "NAME OF VICTIM" "GENDER"
## [10] "AGE" "CAUSE CODE" "VICTIM"
## [13] "NO." "Date DD/MM/YYYY" "Hour"
## [16] "Month"
colnames(Test) <- make.names(colnames(Test))
Notes: Just the first 6 rows & then look at the structure
View(Test)
dim(Test)
## [1] 740 16
summary(Test)
## TIME.24.HOURS BASE.SUB.BASE COUNTY ROAD
## Length:740 Length:740 Length:740 Length:740
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## PLACE MV.INVOLVED BRIEF.ACCIDENT.DETAILS
## Length:740 Length:740 Length:740
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## NAME.OF.VICTIM GENDER AGE CAUSE.CODE
## Length:740 Length:740 Length:740 Length:740
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## VICTIM NO. Date.DD.MM.YYYY Hour
## Length:740 Length:740 Length:740 Length:740
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Month
## Min. : 2.000
## 1st Qu.: 5.000
## Median : 7.000
## Mean : 6.557
## 3rd Qu.: 8.000
## Max. :11.000
colnames(Test)
## [1] "TIME.24.HOURS" "BASE.SUB.BASE" "COUNTY"
## [4] "ROAD" "PLACE" "MV.INVOLVED"
## [7] "BRIEF.ACCIDENT.DETAILS" "NAME.OF.VICTIM" "GENDER"
## [10] "AGE" "CAUSE.CODE" "VICTIM"
## [13] "NO." "Date.DD.MM.YYYY" "Hour"
## [16] "Month"
Notes: Check is done by a simple colnames function
Test$OWNER <- "HDX"
#Check if names have changed
colnames(Test)
## [1] "TIME.24.HOURS" "BASE.SUB.BASE" "COUNTY"
## [4] "ROAD" "PLACE" "MV.INVOLVED"
## [7] "BRIEF.ACCIDENT.DETAILS" "NAME.OF.VICTIM" "GENDER"
## [10] "AGE" "CAUSE.CODE" "VICTIM"
## [13] "NO." "Date.DD.MM.YYYY" "Hour"
## [16] "Month" "OWNER"
Notes: piping makes it easy to combine functions. You’ll see a lot of piping as we go along
Test2 <- Test %>%
rename(Date = `Date.DD.MM.YYYY`, VICTIM_TYPE = VICTIM, BASE_SUB_BASE = `BASE.SUB.BASE`)
# View updated dataframe
colnames(Test2)
## [1] "TIME.24.HOURS" "BASE_SUB_BASE" "COUNTY"
## [4] "ROAD" "PLACE" "MV.INVOLVED"
## [7] "BRIEF.ACCIDENT.DETAILS" "NAME.OF.VICTIM" "GENDER"
## [10] "AGE" "CAUSE.CODE" "VICTIM_TYPE"
## [13] "NO." "Date" "Hour"
## [16] "Month" "OWNER"
Test2
## # A tibble: 740 × 17
## TIME.24.HOURS BASE_…¹ COUNTY ROAD PLACE MV.IN…² BRIEF…³ NAME.…⁴ GENDER AGE
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 0139 DTEO D… NAIRO… WAIY… KANG… KBR 74… VEHICL… UNKNOWN M A
## 2 0845 SAGANA MURAN… SAGA… KAIN… KBY 06… VEHICL… DICKSO… M 61
## 3 0930 MOGOTIO BARIN… EMIN… OTER… UNKNOWN VEHICL… RODGER… M 31
## 4 1250 NAKURU NAKURU NAIR… MARU… KBM 53… THE VE… ISAAC … M A
## 5 1730 KURIA MIGORI NTIM… TARA… GK B 5… VEHICL… CHARLE… M J
## 6 1930 SAGANA KIRIN… KENO… WACH… KBU 59… THE CY… UNKNOWN M A
## 7 2039 KANGUN… MAKUE… NAIR… MAKE… KCC 37… THE VE… BENARD… M 36
## 8 2145 DTEO K… NAIRO… THIK… CAR … KCH 35… VEHICL… COLLIN… M 20
## 9 0730 MURANGA MURAN… GAKO… KAMA… KCJ 92… THE VE… UNKNOWN M A
## 10 0930 THIKA KIAMBU THIK… KAND… KBQ 57… VEHICL… FREDRI… M A
## # … with 730 more rows, 7 more variables: CAUSE.CODE <chr>, VICTIM_TYPE <chr>,
## # NO. <chr>, Date <chr>, Hour <chr>, Month <dbl>, OWNER <chr>, and
## # abbreviated variable names ¹BASE_SUB_BASE, ²MV.INVOLVED,
## # ³BRIEF.ACCIDENT.DETAILS, ⁴NAME.OF.VICTIM
Notes: The Select function will enable this. The “-c” translates to remove columns
Test3<- Test2 %>%
select(-c("NAME.OF.VICTIM", "PLACE", "NO.", "BRIEF.ACCIDENT.DETAILS"))
# View updated dataframe
colnames(Test3)
## [1] "TIME.24.HOURS" "BASE_SUB_BASE" "COUNTY" "ROAD"
## [5] "MV.INVOLVED" "GENDER" "AGE" "CAUSE.CODE"
## [9] "VICTIM_TYPE" "Date" "Hour" "Month"
## [13] "OWNER"
Test3
## # A tibble: 740 × 13
## TIME.…¹ BASE_…² COUNTY ROAD MV.IN…³ GENDER AGE CAUSE…⁴ VICTI…⁵ Date Hour
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 0139 DTEO D… NAIRO… WAIY… KBR 74… M A 98 PEDEST… 2/22… 1
## 2 0845 SAGANA MURAN… SAGA… KBY 06… M 61 29 P/CYCL… 2/22… 8
## 3 0930 MOGOTIO BARIN… EMIN… UNKNOWN M 31 30 PEDEST… 2/22… 9
## 4 1250 NAKURU NAKURU NAIR… KBM 53… M A 37 DRIVER 2/22… 12
## 5 1730 KURIA MIGORI NTIM… GK B 5… M J 36 PEDEST… 2/22… 17
## 6 1930 SAGANA KIRIN… KENO… KBU 59… M A 37 RIDER … 2/22… 19
## 7 2039 KANGUN… MAKUE… NAIR… KCC 37… M 36 82 RIDER 2/22… 20
## 8 2145 DTEO K… NAIRO… THIK… KCH 35… M 20 98 PEDEST… 2/22… 21
## 9 0730 MURANGA MURAN… GAKO… KCJ 92… M A 26 PASSEN… 2/28… 7
## 10 0930 THIKA KIAMBU THIK… KBQ 57… M A 10 P/CYCL… 2/28… 9
## # … with 730 more rows, 2 more variables: Month <dbl>, OWNER <chr>, and
## # abbreviated variable names ¹TIME.24.HOURS, ²BASE_SUB_BASE, ³MV.INVOLVED,
## # ⁴CAUSE.CODE, ⁵VICTIM_TYPE
Notes: Here we first get a view of the column, then filter out unwanted recors from the column. Next we substitute the records with HRS after them with nothing and finally confirm if the data set is clean enough
Test3$TIME.24.HOURS
## [1] "0139" "0845" "0930" "1250" "1730"
## [6] "1930" "2039" "2145" "0730" "0930"
## [11] "1230" "1850" "2030" "0730" "0800"
## [16] "1000" "1800" "1800" "2230" "0600"
## [21] "0730" "1205" "1230" "1230" "1600"
## [26] "1630" "1830" "2045" "2200" "0300"
## [31] "1220" "1430" "1530" "1730" "2030"
## [36] "0630" "0730" "0930" "1400" "1500"
## [41] "2030" "2030" "2030" "3000" "0130"
## [46] "0230" "0700" "1030" "1145" "1610"
## [51] "1800" "2130" "3000" "0840" "1720"
## [56] "1800" "2045" "0840" "1930" "1930"
## [61] "1600" "1830" "2000" "2000" "2050"
## [66] "2230" "1100" "2100" "0330" "0330"
## [71] "1130" "1430" "2100" "0610" "1200"
## [76] "1500" "1900" "2030" "2120" "0330"
## [81] "0430" "0445" "0630" "0745" "1230"
## [86] "1530" "1800" "1830" "2205" "2215"
## [91] "1530" "1930" "2030" "3000" "0615"
## [96] "0930" "0930" "1330" "1400" "1500"
## [101] "1540" "1600" "1700" "1900" "2115"
## [106] "1400" "1845" "0730" "0730" "0930"
## [111] "1330" "1830" "0220HRS" "0600HRS" "0810"
## [116] "1230" "1445" "1530HRS" "1730HRS" "1830HRS"
## [121] "845HRS" "0710" "1350" "1620" "1945"
## [126] "NAROK" "0630" "0630" "1820" "2100"
## [131] "0500hrs" "1100" "1550" "1700" "1725"
## [136] "1930" "2230" "0045 HRS" "0600 HRS" "0700 HRS"
## [141] "1800" "1800" "0835" "1030" "1100"
## [146] "1600" "1630" "1900" "UNKNOWN TIME" "0630"
## [151] "1030" "1500" "1900" "1930" "2100"
## [156] "2100" "2330" "0900" "1200" "1430"
## [161] "1438" "1645" "1700" "1845" "1930"
## [166] "2030" "2100" "2110" "4000" "0630"
## [171] "1340" "1610" "1800" "1856" "UNKNOWN"
## [176] "0715" "1330" "2030" "0230" "0730"
## [181] "1130" "1700" "1955" "2045" "2230"
## [186] "2345" "0230" "0900" "1540" "1930"
## [191] "2000" "2000" "2320" "1000" "1030"
## [196] "1730" "1730" "1750" "1900" "2100"
## [201] "2245" "2330" "1100" "1200" "2330"
## [206] "1030" "2000" "0630" "0730" "1100"
## [211] "1830" "1900" "1940" "2030" "2130"
## [216] "0600" "1230" "1410" "1700" "1800"
## [221] "1945" "2115" "0130" "0530" "1200"
## [226] "1645" "1700" "1720" "2140" "0530"
## [231] "0530" "1140" "1230" "1600" "1745"
## [236] "2100" "1000" "1140" "1730" "0210"
## [241] "1100" "1300" "1330" "1600" "1630"
## [246] "1900" "1930" "2040" "UNKNOWN" "0220"
## [251] "0645" "0900" "2015" "2100" "0400"
## [256] "1550" "1900" "2030" "2230" "1330"
## [261] "1600" "1830" "1900" "1900" "1900"
## [266] "2015" "2030" "1600" "1700" "1900"
## [271] "2000" "2030" "1200" "2015" "2015"
## [276] "2100" "2200" "0500" "1200" "1430"
## [281] "1630" "1730" "3000" "1230" "1500"
## [286] "2300" "0820" "1500" "2330" "4500"
## [291] "0500" "0700" "0730" "0730" "2000"
## [296] "2100" "2200" "0730" "1000" "1720"
## [301] "1945" "2000" "2000" "2145" "2230"
## [306] "0730" "1100" "1730" "2130" "0430"
## [311] "1030" "2030" "2230" "1720" "2030"
## [316] "2240" "2240" "2300" "1740" "1800"
## [321] "0700" "1230" "1645" "1750" "2000"
## [326] "0400" "0700" "1130" "1730" "1900"
## [331] "1940" "2030" "2040" "2200" "2330"
## [336] "2330" "0800" "1120" "1350" "1730"
## [341] "1730" "1800" "2120" "2145" "2155"
## [346] "1000" "2030" "2100" "0600" "1400"
## [351] "1520" "2000" "2130" "2330" "2345"
## [356] "0100" "2140" "0200" "1030" "1300"
## [361] "1600" "1600" "1830" "1945" "1130"
## [366] "1530" "1700" "1730" "2000" "0400"
## [371] "1730" "1900" "2020" "2100" "2100"
## [376] "2130" "2345" "3000" "1500" "1500"
## [381] "1600" "1600" "1830" "2000" "2030"
## [386] "2050" "0215" "1050" "1800" "1800"
## [391] "1830" "2030" "2100" "2300" "2330"
## [396] "0800" "1815" "1930" "2030" "2230"
## [401] "2230" "0230" "0600" "0830" "1530"
## [406] "1730" "2000" "UNKNOWN" "UNKNOWN" "UNKNOWN"
## [411] "0230" "0730" "1445" "1540" "1710"
## [416] "1830" "2200" "0700" "1100" "1130"
## [421] "1810" "2050" "0240" "0530" "0630"
## [426] "1130" "1500" "1500" "1500" "1520"
## [431] "1840" "2245" "0630" "1050" "1200"
## [436] "1750" "2000" "0600" "1400" "1730"
## [441] "1830" "1845" "1900" "1900" "2050"
## [446] "2230" "0845" "1130" "1630" "1700"
## [451] "1930" "2000" "2100" "2145" "UNKNOWN TIME"
## [456] "UNKNOWN TIME" "UNKNOWN TIME" "UNKNOWN TIME" "1700" "0930"
## [461] "1045" "1400" "1430" "2000" "2010"
## [466] "UNKNOWN" "0600" "0830" "1250" "1330"
## [471] "1530" "1830" "2200" "0320" "1400"
## [476] "1800" "1900" "2030" "2140" "0700"
## [481] "1000" "1630" "1700" "2120" "2130"
## [486] "2300" "3000" "0400" "0630" "1230"
## [491] "1800" "2230" "2320" "0545" "0620"
## [496] "1300" "1800" "1830" "2200" "2200"
## [501] "1630" "1730" "1740" "1830" "1830"
## [506] "2300" "0810" "0900" "2130" "1730"
## [511] "0230" "0430" "1000" "1900" "2000"
## [516] "2000" "2230" "2300" "2300" "2330"
## [521] "4000" "#N/A" "#N/A" "0440" "1200"
## [526] "1430" "1430" "2100" "#N/A" "1630"
## [531] "1745" "2030" "3000" "5000" "1800"
## [536] "1930" "1150" "2200" "1840" "2100"
## [541] "2130" "0700" "0700" "0715" "1200"
## [546] "1630" "0800" "0930" "1030" "1100"
## [551] "1400" "1720" "0700" "0830" "1440"
## [556] "1645" "0930" "1240" "1440" "0940"
## [561] "1130" "1150" "1230" "1600" "1600"
## [566] "1630" "1630" "1700" "1740" "1810"
## [571] "1845" "1900" "1930" "1945" "2000"
## [576] "2000" "2130" "2230" "2300" "2300"
## [581] "2300" "0600" "0630" "1030" "1320"
## [586] "1820" "2055" "1420" "1440" "0100"
## [591] "0130" "0400" "0700" "1000" "1230"
## [596] "1945" "2200" "2315" "UNKNOWN TIME" "1530"
## [601] "1530" "1900" "UNKNOWN TIME" "0640" "1100"
## [606] "1345" "2000" "2010" "1545" "1700"
## [611] "1945" "2000" "2230" "1200" "1330"
## [616] "1330" "1730" "2340" "5" "0130"
## [621] "0430" "0630" "0630" "1640" "1645"
## [626] "2030" "2130" "2200" "2345" "3000"
## [631] "5" "5" "0430" "0600" "1630"
## [636] "2050" "2130" "2230" "0720" "1630"
## [641] "1800" "1820" "1900" "2000" "2030"
## [646] "2040" "0345" "1430" "1530" "1600"
## [651] "1700" "2030" "2140" "2210" "UNKNOWN"
## [656] "0600" "0700" "1000" "2200" "2245"
## [661] "0730" "1800" "2000" "2040" "0800"
## [666] "0900" "1350" "2000" "2130" "0630"
## [671] "1030" "1210" "1400" "1000" "1000"
## [676] "1030" "1330" "1400" "1600" "1640"
## [681] "1830" "1900" "2000" "2140" "2230"
## [686] "UNKNOWN TIME" "0700" "1200" "1200" "1330"
## [691] "1330" "2000" "2030" "1000" "1630"
## [696] "1630" "1920" "0600" "1130" "1440"
## [701] "1640" "1920" "2030" "1630" "2000"
## [706] "2015" "2025" "2030" "2045" "2130"
## [711] "2230" "0430" "1300" "1530" "1630"
## [716] "1745" "1940" "1030" "1300" "1730"
## [721] "2000" "2030" "0130" "0630" "0745"
## [726] "1030" "1100" "1330" "1730" "2136"
## [731] "0745" "1430" "1515" "1600" "1745"
## [736] "1747" "1830" "1930" "2030" "2300"
Test4 <- Test3 %>%
filter(TIME.24.HOURS != "UNKNOWN TIME")%>%
filter(TIME.24.HOURS != "UNKNOWN")%>%
filter(TIME.24.HOURS != "#N/A")%>%
filter(TIME.24.HOURS != "NAROK")
# Remove "hrs" from time records
Test4$TIME.24.HOURS <- gsub("HRS", "", Test4$TIME.24.HOURS)
# View updated dataframe
colnames(Test4)
## [1] "TIME.24.HOURS" "BASE_SUB_BASE" "COUNTY" "ROAD"
## [5] "MV.INVOLVED" "GENDER" "AGE" "CAUSE.CODE"
## [9] "VICTIM_TYPE" "Date" "Hour" "Month"
## [13] "OWNER"
# Check if time column is clean
Test4$TIME.24.HOURS
## [1] "0139" "0845" "0930" "1250" "1730" "1930" "2039"
## [8] "2145" "0730" "0930" "1230" "1850" "2030" "0730"
## [15] "0800" "1000" "1800" "1800" "2230" "0600" "0730"
## [22] "1205" "1230" "1230" "1600" "1630" "1830" "2045"
## [29] "2200" "0300" "1220" "1430" "1530" "1730" "2030"
## [36] "0630" "0730" "0930" "1400" "1500" "2030" "2030"
## [43] "2030" "3000" "0130" "0230" "0700" "1030" "1145"
## [50] "1610" "1800" "2130" "3000" "0840" "1720" "1800"
## [57] "2045" "0840" "1930" "1930" "1600" "1830" "2000"
## [64] "2000" "2050" "2230" "1100" "2100" "0330" "0330"
## [71] "1130" "1430" "2100" "0610" "1200" "1500" "1900"
## [78] "2030" "2120" "0330" "0430" "0445" "0630" "0745"
## [85] "1230" "1530" "1800" "1830" "2205" "2215" "1530"
## [92] "1930" "2030" "3000" "0615" "0930" "0930" "1330"
## [99] "1400" "1500" "1540" "1600" "1700" "1900" "2115"
## [106] "1400" "1845" "0730" "0730" "0930" "1330" "1830"
## [113] "0220" "0600" "0810" "1230" "1445" "1530" "1730"
## [120] "1830" "845" "0710" "1350" "1620" "1945" "0630"
## [127] "0630" "1820" "2100" "0500hrs" "1100" "1550" "1700"
## [134] "1725" "1930" "2230" "0045 " "0600 " "0700 " "1800"
## [141] "1800" "0835" "1030" "1100" "1600" "1630" "1900"
## [148] "0630" "1030" "1500" "1900" "1930" "2100" "2100"
## [155] "2330" "0900" "1200" "1430" "1438" "1645" "1700"
## [162] "1845" "1930" "2030" "2100" "2110" "4000" "0630"
## [169] "1340" "1610" "1800" "1856" "0715" "1330" "2030"
## [176] "0230" "0730" "1130" "1700" "1955" "2045" "2230"
## [183] "2345" "0230" "0900" "1540" "1930" "2000" "2000"
## [190] "2320" "1000" "1030" "1730" "1730" "1750" "1900"
## [197] "2100" "2245" "2330" "1100" "1200" "2330" "1030"
## [204] "2000" "0630" "0730" "1100" "1830" "1900" "1940"
## [211] "2030" "2130" "0600" "1230" "1410" "1700" "1800"
## [218] "1945" "2115" "0130" "0530" "1200" "1645" "1700"
## [225] "1720" "2140" "0530" "0530" "1140" "1230" "1600"
## [232] "1745" "2100" "1000" "1140" "1730" "0210" "1100"
## [239] "1300" "1330" "1600" "1630" "1900" "1930" "2040"
## [246] "0220" "0645" "0900" "2015" "2100" "0400" "1550"
## [253] "1900" "2030" "2230" "1330" "1600" "1830" "1900"
## [260] "1900" "1900" "2015" "2030" "1600" "1700" "1900"
## [267] "2000" "2030" "1200" "2015" "2015" "2100" "2200"
## [274] "0500" "1200" "1430" "1630" "1730" "3000" "1230"
## [281] "1500" "2300" "0820" "1500" "2330" "4500" "0500"
## [288] "0700" "0730" "0730" "2000" "2100" "2200" "0730"
## [295] "1000" "1720" "1945" "2000" "2000" "2145" "2230"
## [302] "0730" "1100" "1730" "2130" "0430" "1030" "2030"
## [309] "2230" "1720" "2030" "2240" "2240" "2300" "1740"
## [316] "1800" "0700" "1230" "1645" "1750" "2000" "0400"
## [323] "0700" "1130" "1730" "1900" "1940" "2030" "2040"
## [330] "2200" "2330" "2330" "0800" "1120" "1350" "1730"
## [337] "1730" "1800" "2120" "2145" "2155" "1000" "2030"
## [344] "2100" "0600" "1400" "1520" "2000" "2130" "2330"
## [351] "2345" "0100" "2140" "0200" "1030" "1300" "1600"
## [358] "1600" "1830" "1945" "1130" "1530" "1700" "1730"
## [365] "2000" "0400" "1730" "1900" "2020" "2100" "2100"
## [372] "2130" "2345" "3000" "1500" "1500" "1600" "1600"
## [379] "1830" "2000" "2030" "2050" "0215" "1050" "1800"
## [386] "1800" "1830" "2030" "2100" "2300" "2330" "0800"
## [393] "1815" "1930" "2030" "2230" "2230" "0230" "0600"
## [400] "0830" "1530" "1730" "2000" "0230" "0730" "1445"
## [407] "1540" "1710" "1830" "2200" "0700" "1100" "1130"
## [414] "1810" "2050" "0240" "0530" "0630" "1130" "1500"
## [421] "1500" "1500" "1520" "1840" "2245" "0630" "1050"
## [428] "1200" "1750" "2000" "0600" "1400" "1730" "1830"
## [435] "1845" "1900" "1900" "2050" "2230" "0845" "1130"
## [442] "1630" "1700" "1930" "2000" "2100" "2145" "1700"
## [449] "0930" "1045" "1400" "1430" "2000" "2010" "0600"
## [456] "0830" "1250" "1330" "1530" "1830" "2200" "0320"
## [463] "1400" "1800" "1900" "2030" "2140" "0700" "1000"
## [470] "1630" "1700" "2120" "2130" "2300" "3000" "0400"
## [477] "0630" "1230" "1800" "2230" "2320" "0545" "0620"
## [484] "1300" "1800" "1830" "2200" "2200" "1630" "1730"
## [491] "1740" "1830" "1830" "2300" "0810" "0900" "2130"
## [498] "1730" "0230" "0430" "1000" "1900" "2000" "2000"
## [505] "2230" "2300" "2300" "2330" "4000" "0440" "1200"
## [512] "1430" "1430" "2100" "1630" "1745" "2030" "3000"
## [519] "5000" "1800" "1930" "1150" "2200" "1840" "2100"
## [526] "2130" "0700" "0700" "0715" "1200" "1630" "0800"
## [533] "0930" "1030" "1100" "1400" "1720" "0700" "0830"
## [540] "1440" "1645" "0930" "1240" "1440" "0940" "1130"
## [547] "1150" "1230" "1600" "1600" "1630" "1630" "1700"
## [554] "1740" "1810" "1845" "1900" "1930" "1945" "2000"
## [561] "2000" "2130" "2230" "2300" "2300" "2300" "0600"
## [568] "0630" "1030" "1320" "1820" "2055" "1420" "1440"
## [575] "0100" "0130" "0400" "0700" "1000" "1230" "1945"
## [582] "2200" "2315" "1530" "1530" "1900" "0640" "1100"
## [589] "1345" "2000" "2010" "1545" "1700" "1945" "2000"
## [596] "2230" "1200" "1330" "1330" "1730" "2340" "5"
## [603] "0130" "0430" "0630" "0630" "1640" "1645" "2030"
## [610] "2130" "2200" "2345" "3000" "5" "5" "0430"
## [617] "0600" "1630" "2050" "2130" "2230" "0720" "1630"
## [624] "1800" "1820" "1900" "2000" "2030" "2040" "0345"
## [631] "1430" "1530" "1600" "1700" "2030" "2140" "2210"
## [638] "0600" "0700" "1000" "2200" "2245" "0730" "1800"
## [645] "2000" "2040" "0800" "0900" "1350" "2000" "2130"
## [652] "0630" "1030" "1210" "1400" "1000" "1000" "1030"
## [659] "1330" "1400" "1600" "1640" "1830" "1900" "2000"
## [666] "2140" "2230" "0700" "1200" "1200" "1330" "1330"
## [673] "2000" "2030" "1000" "1630" "1630" "1920" "0600"
## [680] "1130" "1440" "1640" "1920" "2030" "1630" "2000"
## [687] "2015" "2025" "2030" "2045" "2130" "2230" "0430"
## [694] "1300" "1530" "1630" "1745" "1940" "1030" "1300"
## [701] "1730" "2000" "2030" "0130" "0630" "0745" "1030"
## [708] "1100" "1330" "1730" "2136" "0745" "1430" "1515"
## [715] "1600" "1745" "1747" "1830" "1930" "2030" "2300"
Notes: We see the Select column come into play again. this time it is used in ordering column. Quite the powerful function as you can see
Test5 <- Test4 %>% select(Month, TIME.24.HOURS, MV.INVOLVED, everything())
# View updated data frame
Test5
## # A tibble: 721 × 13
## Month TIME.…¹ MV.IN…² BASE_…³ COUNTY ROAD GENDER AGE CAUSE…⁴ VICTI…⁵ Date
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2 0139 KBR 74… DTEO D… NAIRO… WAIY… M A 98 PEDEST… 2/22…
## 2 2 0845 KBY 06… SAGANA MURAN… SAGA… M 61 29 P/CYCL… 2/22…
## 3 2 0930 UNKNOWN MOGOTIO BARIN… EMIN… M 31 30 PEDEST… 2/22…
## 4 2 1250 KBM 53… NAKURU NAKURU NAIR… M A 37 DRIVER 2/22…
## 5 2 1730 GK B 5… KURIA MIGORI NTIM… M J 36 PEDEST… 2/22…
## 6 2 1930 KBU 59… SAGANA KIRIN… KENO… M A 37 RIDER … 2/22…
## 7 2 2039 KCC 37… KANGUN… MAKUE… NAIR… M 36 82 RIDER 2/22…
## 8 2 2145 KCH 35… DTEO K… NAIRO… THIK… M 20 98 PEDEST… 2/22…
## 9 2 0730 KCJ 92… MURANGA MURAN… GAKO… M A 26 PASSEN… 2/28…
## 10 2 0930 KBQ 57… THIKA KIAMBU THIK… M A 10 P/CYCL… 2/28…
## # … with 711 more rows, 2 more variables: Hour <chr>, OWNER <chr>, and
## # abbreviated variable names ¹TIME.24.HOURS, ²MV.INVOLVED, ³BASE_SUB_BASE,
## # ⁴CAUSE.CODE, ⁵VICTIM_TYPE
colnames(Test5)
## [1] "Month" "TIME.24.HOURS" "MV.INVOLVED" "BASE_SUB_BASE"
## [5] "COUNTY" "ROAD" "GENDER" "AGE"
## [9] "CAUSE.CODE" "VICTIM_TYPE" "Date" "Hour"
## [13] "OWNER"
Notes: A new desc function is used and your guess is as good as mine in its use
Test6 <- Test5 %>% arrange(desc(VICTIM_TYPE))
# View updated dataframe
Test6
## # A tibble: 721 × 13
## Month TIME.…¹ MV.IN…² BASE_…³ COUNTY ROAD GENDER AGE CAUSE…⁴ VICTI…⁵ Date
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 3 0930 M/CYCL… NANDI … NANDI KABI… M 25&28 37 RIDERS 3/23…
## 2 8 1000 KBU666… NYARI MOMBA… OLD … M A 16 RIDER … 8/4/…
## 3 2 1930 KBU 59… SAGANA KIRIN… KENO… M A 37 RIDER … 2/22…
## 4 3 1500 KCF 03… PAP ON… KISUMU KATI… M&F 42,2… 26 RIDER … 3/18…
## 5 3 0930 KYX 55… KANDARA MURAN… MAKE… M A 37 RIDER … 3/21…
## 6 3 1400 UNREGI… DTEO G… NAIRO… PROF… M A 36 RIDER … 3/21…
## 7 3 1540 KCB 80… SAGANA KIRIN… SAGA… M 21 &… 26 RIDER … 3/21…
## 8 3 1700 KCX 81… AWENDO MIGORI AWEN… M A 50 RIDER … 3/21…
## 9 3 0730 M/CYCL… MUMIAS KAKAM… EKER… M & F… 52 &… 98 RIDER … 3/23…
## 10 2 2039 KCC 37… KANGUN… MAKUE… NAIR… M 36 82 RIDER 2/22…
## # … with 711 more rows, 2 more variables: Hour <chr>, OWNER <chr>, and
## # abbreviated variable names ¹TIME.24.HOURS, ²MV.INVOLVED, ³BASE_SUB_BASE,
## # ⁴CAUSE.CODE, ⁵VICTIM_TYPE
Notes: First we list the duplicated functions in the specified column using the duplicated() function. Next We remove them and confirm this with the nrows function
duplicated_records <- Test6 %>% select(CAUSE.CODE, MV.INVOLVED, BASE_SUB_BASE) %>%
duplicated()
# Print the duplicated records
print(Test6[duplicated_records,])
## # A tibble: 21 × 13
## Month TIME.…¹ MV.IN…² BASE_…³ COUNTY ROAD GENDER AGE CAUSE…⁴ VICTI…⁵ Date
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 10 0700 UNKNOW… MAKUENI MAKUE… MBUN… M 57 98 PEDEST… 10/2…
## 2 11 0630 UNKNOWN EMBAKA… NAIRO… MOMB… M A 98 PEDEST… 11/5…
## 3 7 1600 KBT 33… KISII KISII KISI… M A 26 PASSEN… 7/14…
## 4 3 2030 KMDR 7… MASENO KISUMU KISU… M A #N/A M/CYCL… 3/10…
## 5 7 2100 KBA 18… BUSIA BUSIA BUTU… M A 79 M/CYCL… 7/16…
## 6 7 1500 KCF 27… BONDO SIAYA BOND… M A 10 M/CYCL… 7/19…
## 7 8 2300 KMCL 8… MAGUMU NYAND… FLYO… M A 68 M/CYCL… 8/11…
## 8 8 1630 KBK 66… MAUA MERU MERU… M A 26 M/CYCL… 8/27…
## 9 8 2000 KCJ 68… BUTERE KAKAM… SHIA… M A 8 M/CYCL… 8/27…
## 10 8 2300 KBU 17… SOTIK BOMET KAPL… M A 8 M/CYCL… 8/27…
## # … with 11 more rows, 2 more variables: Hour <chr>, OWNER <chr>, and
## # abbreviated variable names ¹TIME.24.HOURS, ²MV.INVOLVED, ³BASE_SUB_BASE,
## # ⁴CAUSE.CODE, ⁵VICTIM_TYPE
#Check duplicated records in terms of cause code, MV involved and base
Test7 <- Test6 %>% distinct(CAUSE.CODE, MV.INVOLVED, BASE_SUB_BASE, .keep_all = TRUE)
#CHECK IF THEY HAVE BEEN REMOVED
nrow(Test6)
## [1] 721
nrow(Test7)
## [1] 700
#rows have been removed
Notes: We clean column names a final time and name the data HDX clean
HDX_clean <- Test7 %>%
rename(TIME_24_HOURS = `TIME.24.HOURS`, MV_INVOLVED = `MV.INVOLVED`, CAUSE_CODE = `CAUSE.CODE`)
#check all column names
colnames(HDX_clean)
## [1] "Month" "TIME_24_HOURS" "MV_INVOLVED" "BASE_SUB_BASE"
## [5] "COUNTY" "ROAD" "GENDER" "AGE"
## [9] "CAUSE_CODE" "VICTIM_TYPE" "Date" "Hour"
## [13] "OWNER"
Notes: We get frequency of accidents per month then use th plotly function to come up with an interactive graph. The frequency is denoted with n
accidents_by_month <- HDX_clean %>% count(Month)
# Print the result
print(accidents_by_month)
## # A tibble: 10 × 2
## Month n
## <dbl> <int>
## 1 2 13
## 2 3 139
## 3 4 12
## 4 5 69
## 5 6 48
## 6 7 158
## 7 8 121
## 8 9 59
## 9 10 59
## 10 11 22
# Create an interactive bar plot of accidents by month
plot_ly(data = accidents_by_month, x = ~Month, y = ~n, type = "bar",
name = "Number of accidents", text = ~n) %>%
layout(xaxis = list(title = "Month"), yaxis = list(title = "Number of accidents"),
title = "Accidents by month")
Notes: We first create a subset table called accident types. Then we find the most frequent accident using the which max function. Finally we plot an interactive graph
# Create a frequency table of the CAUSE.CODE variable
accident_types <- table(HDX_clean$CAUSE_CODE)
#See frequency table
View(accident_types)
# Find the most frequent accident type
most_frequent_accident <- names(accident_types)[which.max(accident_types)]
#view most frequent accident type
most_frequent_accident
## [1] "98"
# Print the most frequent accident type
cat("The most frequent accident type is", most_frequent_accident)
## The most frequent accident type is 98
# Create a bar chart using plot_ly
plot_ly(x = names(accident_types), y = accident_types, type = 'bar', marker = list(color = '#1f77b4')) %>%
layout(xaxis = list(title = "Accident type"),
yaxis = list(title = "Frequency"),
title = "Accident types",
showlegend = FALSE) %>%
add_text(x = names(accident_types), y = accident_types, text = accident_types, textposition = "top", textfont = list(size = 14))
## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...
Notes: A lot to unpack here; First i converted the to GENDER column to factors to enable math calculations. Then created a subset data set of only male and female genders This column was unclean and I was not sure on what exactly some letters meant so I opted to eliminate these records for this subset. This reduced sample size which is quite dangerous. But it was not a big chop to affect our results so we’re good. Finally I generated a column that disaggregates the M and F in order to allow for plotting
#convert to character to enable mathematical calculations on character column type
HDX_clean$GENDER <- as.factor(HDX_clean$GENDER)
# Subset data for only male and female genders
#first we get the male and female columns
gender_by_accident_mf <- HDX_clean %>%
filter(GENDER %in% c("M", "F")) %>%
group_by(CAUSE_CODE, GENDER) %>%
summarize(n = n()) %>%
ungroup() %>%
pivot_wider(names_from = GENDER, values_from = n, values_fill = 0) %>%
mutate(Total = M + F)
## `summarise()` has grouped output by 'CAUSE_CODE'. You can override using the
## `.groups` argument.
#then we generate a column that disaggregates the M and F in order to allow for plotting
gender_by_accident_mf_long <- gender_by_accident_mf %>%
pivot_longer(cols = c("M", "F"), names_to = "Gender", values_to = "n")
# View the resulting data frame
gender_by_accident_mf_long
## # A tibble: 128 × 4
## CAUSE_CODE Total Gender n
## <chr> <int> <chr> <int>
## 1 #N/A 19 M 16
## 2 #N/A 19 F 3
## 3 10 57 M 49
## 4 10 57 F 8
## 5 11 7 M 6
## 6 11 7 F 1
## 7 12 1 M 1
## 8 12 1 F 0
## 9 13 1 M 1
## 10 13 1 F 0
## # … with 118 more rows
# finally we Plot a graph to show accident type disaggregated by gender
ggplot(gender_by_accident_mf_long, aes(x = n, y = CAUSE_CODE, fill = Gender)) +
geom_col(position = "dodge") +
geom_text(aes(label = ifelse(n > 0, n, "")), position = position_dodge(width = 0.9), vjust = -0.5)+
xlab("Number of Accidents") +
ylab("Accident Type") +
ggtitle("Gender by Accident Type (Male and Female Only)") +
scale_fill_manual(values = c("pink","blue"))
Notes: The Head function specifies the number of rows
top_counties <- HDX_clean %>%
group_by(COUNTY) %>%
summarize(n = n()) %>%
arrange(desc(n)) %>%
head(10)
#view top counties
top_counties
## # A tibble: 10 × 2
## COUNTY n
## <chr> <int>
## 1 NAIROBI 109
## 2 KIAMBU 63
## 3 NAKURU 43
## 4 MACHAKOS 39
## 5 MAKUENI 30
## 6 KAKAMEGA 29
## 7 NYERI 28
## 8 KERICHO 21
## 9 KISUMU 21
## 10 EMBU 19
#USING AN INTERACTIVE PLOT
plot_ly(top_counties, x = ~ COUNTY , y = ~n, type = "bar", text = ~n) %>%
layout(xaxis = list(title = "County Name"),
yaxis = list(title = "Number of Accidents"),
title = "Top 10 Accident Hotspot Counties")
Notes: Filter function comes into play
NMS_data<-filter(HDX_clean, COUNTY == 'NAIROBI' |
COUNTY == 'MACHAKOS' |
COUNTY == 'KIAMBU' |
COUNTY == 'KAJIADO' |
COUNTY == 'MURANGA' )
head(NMS_data)
## # A tibble: 6 × 13
## Month TIME_2…¹ MV_IN…² BASE_…³ COUNTY ROAD GENDER AGE CAUSE…⁴ VICTI…⁵ Date
## <dbl> <chr> <chr> <chr> <chr> <chr> <fct> <chr> <chr> <chr> <chr>
## 1 3 0930 KYX 55… KANDARA MURAN… MAKE… M A 37 RIDER … 3/21…
## 2 3 1400 UNREGI… DTEO G… NAIRO… PROF… M A 36 RIDER … 3/21…
## 3 3 2030 KBB 71… DTEO I… NAIRO… LIKO… M A 39 RIDER 3/9/…
## 4 3 1200 KBJ 68… JUJA KIAMBU GATU… M A 8 RIDER 3/18…
## 5 3 1500 KBS 96… THIKA KIAMBU NYER… M 40 57 RIDER 3/21…
## 6 3 0710 M/V KB… DTEO G… NAIRO… LOWE… M 39 8 RIDER 3/25…
## # … with 2 more variables: Hour <chr>, OWNER <chr>, and abbreviated variable
## # names ¹TIME_24_HOURS, ²MV_INVOLVED, ³BASE_SUB_BASE, ⁴CAUSE_CODE,
## # ⁵VICTIM_TYPE
Notes: Take note of the group by function that enables summaries of frequencies
#Steps vs Calories
highest_accidents <- NMS_data%>%
group_by(COUNTY) %>%
summarize(accidents = n()) %>%
arrange(desc(accidents))%>%
ungroup()
highest_accidents
## # A tibble: 5 × 2
## COUNTY accidents
## <chr> <int>
## 1 NAIROBI 109
## 2 KIAMBU 63
## 3 MACHAKOS 39
## 4 KAJIADO 19
## 5 MURANGA 14
ggplot(highest_accidents, aes(x = reorder(COUNTY, accidents), y = accidents)) +
geom_bar(stat = "identity", fill = "steelblue") +
xlab("County") +
ylab("Number of Accidents") +
ggtitle("Top Metropolitan Counties by Number of Accidents") +
geom_text(aes(label = accidents), position = position_dodge(width = 0.9), vjust = -0.5)+
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Print the most frequent accident type
top<-head(highest_accidents,1)
cat("The County with the most accidents is", top$COUNTY)
## The County with the most accidents is NAIROBI
Notes: Position dodge helps make it a group bar chart for easy summary on the visual. The scale_x_discrete helps list all months on the x scale
accidents_by_month_county <- NMS_data %>%
group_by(Month, COUNTY) %>%
summarize(accidents = n()) %>%
ungroup()
## `summarise()` has grouped output by 'Month'. You can override using the
## `.groups` argument.
#view accidents
accidents_by_month_county
## # A tibble: 42 × 3
## Month COUNTY accidents
## <dbl> <chr> <int>
## 1 2 KIAMBU 2
## 2 2 MACHAKOS 1
## 3 2 MURANGA 2
## 4 2 NAIROBI 2
## 5 3 KAJIADO 3
## 6 3 KIAMBU 18
## 7 3 MACHAKOS 8
## 8 3 MURANGA 2
## 9 3 NAIROBI 18
## 10 4 KIAMBU 1
## # … with 32 more rows
#plot
ggplot(accidents_by_month_county, aes(x = Month, y = accidents, fill = COUNTY)) +
geom_bar(stat = "identity", position = "dodge") +
xlab("Month") +
ylab("Number of Accidents") +
geom_text(aes(label=accidents), position=position_dodge(width=0.9), vjust=-0.5) +
ggtitle("Accidents by Month and County")+
scale_x_discrete(limits = unique(accidents_by_month_county$Month))
## Warning: Continuous limits supplied to discrete scale.
## ℹ Did you mean `limits = factor(...)` or `scale_*_continuous()`?