#INTRODUCTION: It consist of exploratory analysis of HW01 data.
options(repos=c(CRAN= “https://cran.rstudio.com”))
#Install the packages
install.packages(“tidyverse”)
install.packages(“readxl”)
install.packages(“ggplot2”)
install.packages(“reshape2”)
install.packages(“dplyr”)
#load the library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(ggplot2)
library(readxl)
#Loading data
data<-read_excel("C:/Users/hp/Downloads/HW1_Data.xlsx")
head(data)
## # A tibble: 6 × 19
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 Clear Dark, not ligh… 2 lane, 2 w… Straight, l… 1 Marked lanes
## 2 Clear Dark, not ligh… 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 3 Clear Daylight 2 lane, 2 w… Straight, l… 1 Marked lanes
## 4 Clear Daylight 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 5 Clear Dark, not ligh… 2 lane, 2 w… Straight, g… 1 None
## 6 Clear Daylight Unknown Straight, l… 1 None
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## # FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## # Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## # Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## # Prsn_Injry_Sev_ID <chr>
#looking for NA data
sum(is.na(data))
## [1] 0
#Summary
summary(data)
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID
## Length:1295 Length:1295 Length:1295 Length:1295
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## SurfDry Traffic_Cntl_ID Harm_Evnt_ID Intrsct_Relat_ID
## Min. :0.0000 Length:1295 Length:1295 Length:1295
## 1st Qu.:1.0000 Class :character Class :character Class :character
## Median :1.0000 Mode :character Mode :character Mode :character
## Mean :0.9143
## 3rd Qu.:1.0000
## Max. :1.0000
## FHE_Collsn_ID Road_Part_Adj_ID Road_Cls_ID Pop_Group_ID
## Length:1295 Length:1295 Length:1295 Length:1295
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Crash_Speed_LimitCat Veh_Body_Styl_ID Prsn_Ethnicity_ID GenMale
## Length:1295 Length:1295 Length:1295 Min. :0.0000
## Class :character Class :character Class :character 1st Qu.:1.0000
## Mode :character Mode :character Mode :character Median :1.0000
## Mean :0.8842
## 3rd Qu.:1.0000
## Max. :1.0000
## TrafVol Prsn_Age Prsn_Injry_Sev_ID
## Min. : 215 Length:1295 Length:1295
## 1st Qu.: 6765 Class :character Class :character
## Median :14170 Mode :character Mode :character
## Mean :14417
## 3rd Qu.:22010
## Max. :28970
#Head operations
#looking values of data from weather condition to surface condition
head(select(data,Wthr_Cond_ID:SurfDry))
## # A tibble: 6 × 5
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID SurfDry
## <chr> <chr> <chr> <chr> <dbl>
## 1 Clear Dark, not lighted 2 lane, 2 way Straight, level 1
## 2 Clear Dark, not lighted 2 lane, 2 way Straight, level 1
## 3 Clear Daylight 2 lane, 2 way Straight, level 1
## 4 Clear Daylight 2 lane, 2 way Straight, level 1
## 5 Clear Dark, not lighted 2 lane, 2 way Straight, grade 1
## 6 Clear Daylight Unknown Straight, level 1
#Showing all columns except road type and road alignment
head(select(data, -(Road_Type_ID:Road_Algn_ID)))
## # A tibble: 6 × 17
## Wthr_Cond_ID Light_Cond_ID SurfDry Traffic_Cntl_ID Harm_Evnt_ID
## <chr> <chr> <dbl> <chr> <chr>
## 1 Clear Dark, not lighted 1 Marked lanes Motor vehicle in…
## 2 Clear Dark, not lighted 1 Center stripe/divider Motor vehicle in…
## 3 Clear Daylight 1 Marked lanes Motor vehicle in…
## 4 Clear Daylight 1 Center stripe/divider Fixed object
## 5 Clear Dark, not lighted 1 None Motor vehicle in…
## 6 Clear Daylight 1 None Motor vehicle in…
## # ℹ 12 more variables: Intrsct_Relat_ID <chr>, FHE_Collsn_ID <chr>,
## # Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>, Pop_Group_ID <chr>,
## # Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## # Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## # Prsn_Injry_Sev_ID <chr>
# Showing data for Gender 1
head(filter(data, GenMale ==1))
## # A tibble: 6 × 19
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 Clear Dark, not ligh… 2 lane, 2 w… Straight, l… 1 Marked lanes
## 2 Clear Dark, not ligh… 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 3 Clear Daylight 2 lane, 2 w… Straight, l… 1 Marked lanes
## 4 Clear Daylight 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 5 Clear Dark, not ligh… 2 lane, 2 w… Straight, g… 1 None
## 6 Clear Daylight Unknown Straight, l… 1 None
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## # FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## # Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## # Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## # Prsn_Injry_Sev_ID <chr>
# Arranging data in descending order based on person age
head(data <- arrange(data, desc(Prsn_Age)))
## # A tibble: 6 × 19
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 Clear Daylight 2 lane, 2 way Straight, l… 1 Marked lanes
## 2 Cloudy Daylight 2 lane, 2 way Straight, l… 1 Marked lanes
## 3 Clear Daylight Unknown Straight, l… 1 None
## 4 Clear Daylight 2 lane, 2 way Straight, g… 1 Center stripe/…
## 5 Rain Daylight Unknown Straight, l… 0 None
## 6 Clear Daylight 4 or more lan… Straight, l… 1 Marked lanes
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## # FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## # Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## # Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## # Prsn_Injry_Sev_ID <chr>
#Rename variable name
data <- rename(data, Surfacedry = "SurfDry")
data
## # A tibble: 1,295 × 19
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID Surfacedry
## <chr> <chr> <chr> <chr> <dbl>
## 1 Clear Daylight 2 lane, 2 way Straight, lev… 1
## 2 Cloudy Daylight 2 lane, 2 way Straight, lev… 1
## 3 Clear Daylight Unknown Straight, lev… 1
## 4 Clear Daylight 2 lane, 2 way Straight, gra… 1
## 5 Rain Daylight Unknown Straight, lev… 0
## 6 Clear Daylight 4 or more lanes, divided Straight, lev… 1
## 7 Clear Daylight 2 lane, 2 way Straight, lev… 1
## 8 Clear Daylight 2 lane, 2 way Straight, gra… 1
## 9 Clear Daylight 4 or more lanes, divided Straight, gra… 1
## 10 Cloudy Daylight Unknown Straight, lev… 1
## # ℹ 1,285 more rows
## # ℹ 14 more variables: Traffic_Cntl_ID <chr>, Harm_Evnt_ID <chr>,
## # Intrsct_Relat_ID <chr>, FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>,
## # Road_Cls_ID <chr>, Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>,
## # Veh_Body_Styl_ID <chr>, Prsn_Ethnicity_ID <chr>, GenMale <dbl>,
## # TrafVol <dbl>, Prsn_Age <chr>, Prsn_Injry_Sev_ID <chr>
data <- mutate(data, ScaledTrafficVolume=scale(TrafVol))
data
## # A tibble: 1,295 × 20
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID Surfacedry
## <chr> <chr> <chr> <chr> <dbl>
## 1 Clear Daylight 2 lane, 2 way Straight, lev… 1
## 2 Cloudy Daylight 2 lane, 2 way Straight, lev… 1
## 3 Clear Daylight Unknown Straight, lev… 1
## 4 Clear Daylight 2 lane, 2 way Straight, gra… 1
## 5 Rain Daylight Unknown Straight, lev… 0
## 6 Clear Daylight 4 or more lanes, divided Straight, lev… 1
## 7 Clear Daylight 2 lane, 2 way Straight, lev… 1
## 8 Clear Daylight 2 lane, 2 way Straight, gra… 1
## 9 Clear Daylight 4 or more lanes, divided Straight, gra… 1
## 10 Cloudy Daylight Unknown Straight, lev… 1
## # ℹ 1,285 more rows
## # ℹ 15 more variables: Traffic_Cntl_ID <chr>, Harm_Evnt_ID <chr>,
## # Intrsct_Relat_ID <chr>, FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>,
## # Road_Cls_ID <chr>, Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>,
## # Veh_Body_Styl_ID <chr>, Prsn_Ethnicity_ID <chr>, GenMale <dbl>,
## # TrafVol <dbl>, Prsn_Age <chr>, Prsn_Injry_Sev_ID <chr>,
## # ScaledTrafficVolume <dbl[,1]>
#Creating new dataset newdata
newdata <- transmute(data, ScaledTrafficVolume = scale(TrafVol),Personage=Prsn_Age)
newdata
## # A tibble: 1,295 × 2
## ScaledTrafficVolume[,1] Personage
## <dbl> <chr>
## 1 -0.343 Other
## 2 0.754 Other
## 3 -0.919 Other
## 4 -0.0401 Other
## 5 1.10 Other
## 6 1.67 Other
## 7 -0.969 Other
## 8 -0.672 Other
## 9 1.31 Other
## 10 1.17 Other
## # ℹ 1,285 more rows
data %>% mutate( Traffictype= factor(TrafVol>5000,labels=c("low","high")))
## # A tibble: 1,295 × 21
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID Surfacedry
## <chr> <chr> <chr> <chr> <dbl>
## 1 Clear Daylight 2 lane, 2 way Straight, lev… 1
## 2 Cloudy Daylight 2 lane, 2 way Straight, lev… 1
## 3 Clear Daylight Unknown Straight, lev… 1
## 4 Clear Daylight 2 lane, 2 way Straight, gra… 1
## 5 Rain Daylight Unknown Straight, lev… 0
## 6 Clear Daylight 4 or more lanes, divided Straight, lev… 1
## 7 Clear Daylight 2 lane, 2 way Straight, lev… 1
## 8 Clear Daylight 2 lane, 2 way Straight, gra… 1
## 9 Clear Daylight 4 or more lanes, divided Straight, gra… 1
## 10 Cloudy Daylight Unknown Straight, lev… 1
## # ℹ 1,285 more rows
## # ℹ 16 more variables: Traffic_Cntl_ID <chr>, Harm_Evnt_ID <chr>,
## # Intrsct_Relat_ID <chr>, FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>,
## # Road_Cls_ID <chr>, Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>,
## # Veh_Body_Styl_ID <chr>, Prsn_Ethnicity_ID <chr>, GenMale <dbl>,
## # TrafVol <dbl>, Prsn_Age <chr>, Prsn_Injry_Sev_ID <chr>,
## # ScaledTrafficVolume <dbl[,1]>, Traffictype <fct>
data
## # A tibble: 1,295 × 20
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID Surfacedry
## <chr> <chr> <chr> <chr> <dbl>
## 1 Clear Daylight 2 lane, 2 way Straight, lev… 1
## 2 Cloudy Daylight 2 lane, 2 way Straight, lev… 1
## 3 Clear Daylight Unknown Straight, lev… 1
## 4 Clear Daylight 2 lane, 2 way Straight, gra… 1
## 5 Rain Daylight Unknown Straight, lev… 0
## 6 Clear Daylight 4 or more lanes, divided Straight, lev… 1
## 7 Clear Daylight 2 lane, 2 way Straight, lev… 1
## 8 Clear Daylight 2 lane, 2 way Straight, gra… 1
## 9 Clear Daylight 4 or more lanes, divided Straight, gra… 1
## 10 Cloudy Daylight Unknown Straight, lev… 1
## # ℹ 1,285 more rows
## # ℹ 15 more variables: Traffic_Cntl_ID <chr>, Harm_Evnt_ID <chr>,
## # Intrsct_Relat_ID <chr>, FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>,
## # Road_Cls_ID <chr>, Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>,
## # Veh_Body_Styl_ID <chr>, Prsn_Ethnicity_ID <chr>, GenMale <dbl>,
## # TrafVol <dbl>, Prsn_Age <chr>, Prsn_Injry_Sev_ID <chr>,
## # ScaledTrafficVolume <dbl[,1]>
#Grouping by personage
highlow <-group_by(newdata,Personage)
highlow
## # A tibble: 1,295 × 2
## # Groups: Personage [5]
## ScaledTrafficVolume[,1] Personage
## <dbl> <chr>
## 1 -0.343 Other
## 2 0.754 Other
## 3 -0.919 Other
## 4 -0.0401 Other
## 5 1.10 Other
## 6 1.67 Other
## 7 -0.969 Other
## 8 -0.672 Other
## 9 1.31 Other
## 10 1.17 Other
## # ℹ 1,285 more rows
newdata=newdata %>% mutate(Trafficnature=factor(ScaledTrafficVolume>0, labels=c("low","high"))) %>% tail()
newdata
## # A tibble: 6 × 3
## ScaledTrafficVolume[,1] Personage Trafficnature
## <dbl> <chr> <fct>
## 1 -0.102 15-24 years low
## 2 -1.09 15-24 years low
## 3 0.299 15-24 years high
## 4 -1.11 15-24 years low
## 5 1.05 15-24 years high
## 6 1.52 15-24 years high
wideData <- pivot_wider(data,names_from = "Road_Type_ID",values_from ="Surfacedry")
wideData
## # A tibble: 1,295 × 23
## Wthr_Cond_ID Light_Cond_ID Road_Algn_ID Traffic_Cntl_ID Harm_Evnt_ID
## <chr> <chr> <chr> <chr> <chr>
## 1 Clear Daylight Straight, level Marked lanes Motor vehic…
## 2 Cloudy Daylight Straight, level Marked lanes Fixed object
## 3 Clear Daylight Straight, level None Motor vehic…
## 4 Clear Daylight Straight, grade Center stripe/divider Motor vehic…
## 5 Rain Daylight Straight, level None Fixed object
## 6 Clear Daylight Straight, level Marked lanes Motor vehic…
## 7 Clear Daylight Straight, level No passing zone Motor vehic…
## 8 Clear Daylight Straight, grade Marked lanes Motor vehic…
## 9 Clear Daylight Straight, grade Center stripe/divider Motor vehic…
## 10 Cloudy Daylight Straight, level None Parked car
## # ℹ 1,285 more rows
## # ℹ 18 more variables: Intrsct_Relat_ID <chr>, FHE_Collsn_ID <chr>,
## # Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>, Pop_Group_ID <chr>,
## # Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## # Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## # Prsn_Injry_Sev_ID <chr>, ScaledTrafficVolume <dbl[,1]>,
## # `2 lane, 2 way` <dbl>, Unknown <dbl>, `4 or more lanes, divided` <dbl>, …
data<-data%>%mutate(Gender=factor(GenMale==1,labels=c("Female","Male")))
ggplot(data, aes(x=Gender)) +geom_bar(,binwidth=1,fill="lightgreen") +theme_minimal() +labs(title="Crashes by Gender", x="Gender", y="Count")
## Warning in geom_bar(, binwidth = 1, fill = "lightgreen"): Ignoring unknown
## parameters: `binwidth`
#Barplot for weather condition by injury severity
ggplot(data, aes(x=Wthr_Cond_ID, fill=Prsn_Injry_Sev_ID)) +
geom_bar(position="stack") +
theme_minimal() +
labs(title="Injury Severity by Weather Condition", x="Weather Condition", y="Count", fill="Injury Severity")
#Boxplot for Road type vs traffic volume
ggplot(data, aes(x=Road_Type_ID, y=TrafVol)) +
geom_boxplot(fill="lightgreen") +
theme_bw() +
labs(title="Traffic Volume by Road Type", x="Road Type", y="Traffic Volume")
#Boxplot for Crash speed limit by weather condition
ggplot(data, aes(x=Wthr_Cond_ID, y=as.numeric(gsub("[^0-9]", "", Crash_Speed_LimitCat)))) + geom_boxplot(fill="lightpink", color="black") + theme_minimal() + labs(title="Crash Speed Limit by Weather Condition", x="Weather Condition", y="Crash Speed Limit")
## Warning: Removed 62 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
#Scatterplot for traffic volume vs crash speed limit
data$Crash_Speed_LimitCat <- as.numeric(gsub("[^0-9]", "", data$Crash_Speed_LimitCat))
ggplot(data, aes(x=Crash_Speed_LimitCat, y=TrafVol)) +
geom_point(color="blue") +
theme_minimal() +labs(title="Traffic Volume vs Crash Speed Limit", x="Crash Speed Limit", y="Traffic Volume")
## Warning: Removed 62 rows containing missing values or values outside the scale range
## (`geom_point()`).
#heatmap for road type vs injury severity
library(reshape2)
heatmap_file <- table(data$Road_Type_ID,data$Prsn_Injry_Sev_ID)
heatmap_dataframe <- melt(heatmap_file)
ggplot(heatmap_dataframe, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() + theme_bw() + labs(title="Heatmap of Road Type vs Injury Severity", x="Road Type", y="Injury Severity")
# Heatmap for road alignment vs injury severity
heatmap_file2 <- table(data$Road_Algn_ID,data$Prsn_Injry_Sev_ID)
heatmap_dataframe2 <- melt(heatmap_file2)
ggplot(heatmap_dataframe2, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() + theme_bw() + labs(title="Heatmap of Road alignment vs injury severity", x="Road Alignment", y="Injury Severity")
#histogram for Traffic volume
ggplot(data,aes(x=TrafVol)) + geom_histogram(binwidth=400,fill=rgb(0.5,0.4,0.6),color="blue")+theme_bw()+labs(title="Histogram for traffic volume",x="Traffic volume",y="frequency")
#histogram for Person age
ggplot(data,aes(x=Prsn_Age)) + geom_histogram(stat="count",fill=rgb(0.3,0.8,0.2),color="blue")+theme_bw()+labs(title="Histogram for person age",x="Person age",y="frequency")
## Warning in geom_histogram(stat = "count", fill = rgb(0.3, 0.8, 0.2), color =
## "blue"): Ignoring unknown parameters: `binwidth`, `bins`, and `pad`
Comments: There is a high correlation between Injury severity O and straight and level road alignment and Injury severity O and 2 lane , 2 way road as well as unknown road type.There is high injury severity of all types(O,KA,BC) in clear weather condition. The people from age group 25-54 years are prone to traffic crashes realted injuries. Also, man have the heighest crashes number.
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.