#INTRODUCTION: It consist of exploratory analysis of HW01 data.

options(repos=c(CRAN= “https://cran.rstudio.com”))

#Install the packages

install.packages(“tidyverse”)

install.packages(“readxl”)

install.packages(“ggplot2”)

install.packages(“reshape2”)

install.packages(“dplyr”)

#load the library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(ggplot2)
library(readxl)
#Loading data
data<-read_excel("C:/Users/hp/Downloads/HW1_Data.xlsx")
head(data)
## # A tibble: 6 × 19
##   Wthr_Cond_ID Light_Cond_ID   Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
##   <chr>        <chr>           <chr>        <chr>          <dbl> <chr>          
## 1 Clear        Dark, not ligh… 2 lane, 2 w… Straight, l…       1 Marked lanes   
## 2 Clear        Dark, not ligh… 2 lane, 2 w… Straight, l…       1 Center stripe/…
## 3 Clear        Daylight        2 lane, 2 w… Straight, l…       1 Marked lanes   
## 4 Clear        Daylight        2 lane, 2 w… Straight, l…       1 Center stripe/…
## 5 Clear        Dark, not ligh… 2 lane, 2 w… Straight, g…       1 None           
## 6 Clear        Daylight        Unknown      Straight, l…       1 None           
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## #   FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## #   Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## #   Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## #   Prsn_Injry_Sev_ID <chr>
#looking for NA data
sum(is.na(data))
## [1] 0

#Summary

summary(data)
##  Wthr_Cond_ID       Light_Cond_ID      Road_Type_ID       Road_Algn_ID      
##  Length:1295        Length:1295        Length:1295        Length:1295       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     SurfDry       Traffic_Cntl_ID    Harm_Evnt_ID       Intrsct_Relat_ID  
##  Min.   :0.0000   Length:1295        Length:1295        Length:1295       
##  1st Qu.:1.0000   Class :character   Class :character   Class :character  
##  Median :1.0000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.9143                                                           
##  3rd Qu.:1.0000                                                           
##  Max.   :1.0000                                                           
##  FHE_Collsn_ID      Road_Part_Adj_ID   Road_Cls_ID        Pop_Group_ID      
##  Length:1295        Length:1295        Length:1295        Length:1295       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Crash_Speed_LimitCat Veh_Body_Styl_ID   Prsn_Ethnicity_ID     GenMale      
##  Length:1295          Length:1295        Length:1295        Min.   :0.0000  
##  Class :character     Class :character   Class :character   1st Qu.:1.0000  
##  Mode  :character     Mode  :character   Mode  :character   Median :1.0000  
##                                                             Mean   :0.8842  
##                                                             3rd Qu.:1.0000  
##                                                             Max.   :1.0000  
##     TrafVol        Prsn_Age         Prsn_Injry_Sev_ID 
##  Min.   :  215   Length:1295        Length:1295       
##  1st Qu.: 6765   Class :character   Class :character  
##  Median :14170   Mode  :character   Mode  :character  
##  Mean   :14417                                        
##  3rd Qu.:22010                                        
##  Max.   :28970

#Head operations

#looking values of data from weather condition to surface condition
head(select(data,Wthr_Cond_ID:SurfDry))
## # A tibble: 6 × 5
##   Wthr_Cond_ID Light_Cond_ID     Road_Type_ID  Road_Algn_ID    SurfDry
##   <chr>        <chr>             <chr>         <chr>             <dbl>
## 1 Clear        Dark, not lighted 2 lane, 2 way Straight, level       1
## 2 Clear        Dark, not lighted 2 lane, 2 way Straight, level       1
## 3 Clear        Daylight          2 lane, 2 way Straight, level       1
## 4 Clear        Daylight          2 lane, 2 way Straight, level       1
## 5 Clear        Dark, not lighted 2 lane, 2 way Straight, grade       1
## 6 Clear        Daylight          Unknown       Straight, level       1
#Showing all columns except road type and road alignment
head(select(data, -(Road_Type_ID:Road_Algn_ID)))
## # A tibble: 6 × 17
##   Wthr_Cond_ID Light_Cond_ID     SurfDry Traffic_Cntl_ID       Harm_Evnt_ID     
##   <chr>        <chr>               <dbl> <chr>                 <chr>            
## 1 Clear        Dark, not lighted       1 Marked lanes          Motor vehicle in…
## 2 Clear        Dark, not lighted       1 Center stripe/divider Motor vehicle in…
## 3 Clear        Daylight                1 Marked lanes          Motor vehicle in…
## 4 Clear        Daylight                1 Center stripe/divider Fixed object     
## 5 Clear        Dark, not lighted       1 None                  Motor vehicle in…
## 6 Clear        Daylight                1 None                  Motor vehicle in…
## # ℹ 12 more variables: Intrsct_Relat_ID <chr>, FHE_Collsn_ID <chr>,
## #   Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>, Pop_Group_ID <chr>,
## #   Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## #   Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## #   Prsn_Injry_Sev_ID <chr>
# Showing data for Gender 1 
head(filter(data, GenMale ==1))
## # A tibble: 6 × 19
##   Wthr_Cond_ID Light_Cond_ID   Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
##   <chr>        <chr>           <chr>        <chr>          <dbl> <chr>          
## 1 Clear        Dark, not ligh… 2 lane, 2 w… Straight, l…       1 Marked lanes   
## 2 Clear        Dark, not ligh… 2 lane, 2 w… Straight, l…       1 Center stripe/…
## 3 Clear        Daylight        2 lane, 2 w… Straight, l…       1 Marked lanes   
## 4 Clear        Daylight        2 lane, 2 w… Straight, l…       1 Center stripe/…
## 5 Clear        Dark, not ligh… 2 lane, 2 w… Straight, g…       1 None           
## 6 Clear        Daylight        Unknown      Straight, l…       1 None           
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## #   FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## #   Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## #   Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## #   Prsn_Injry_Sev_ID <chr>
# Arranging data in descending order based on person age
head(data <- arrange(data, desc(Prsn_Age)))
## # A tibble: 6 × 19
##   Wthr_Cond_ID Light_Cond_ID Road_Type_ID   Road_Algn_ID SurfDry Traffic_Cntl_ID
##   <chr>        <chr>         <chr>          <chr>          <dbl> <chr>          
## 1 Clear        Daylight      2 lane, 2 way  Straight, l…       1 Marked lanes   
## 2 Cloudy       Daylight      2 lane, 2 way  Straight, l…       1 Marked lanes   
## 3 Clear        Daylight      Unknown        Straight, l…       1 None           
## 4 Clear        Daylight      2 lane, 2 way  Straight, g…       1 Center stripe/…
## 5 Rain         Daylight      Unknown        Straight, l…       0 None           
## 6 Clear        Daylight      4 or more lan… Straight, l…       1 Marked lanes   
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## #   FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## #   Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## #   Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## #   Prsn_Injry_Sev_ID <chr>
#Rename variable name
data <- rename(data, Surfacedry = "SurfDry")
data
## # A tibble: 1,295 × 19
##    Wthr_Cond_ID Light_Cond_ID Road_Type_ID             Road_Algn_ID   Surfacedry
##    <chr>        <chr>         <chr>                    <chr>               <dbl>
##  1 Clear        Daylight      2 lane, 2 way            Straight, lev…          1
##  2 Cloudy       Daylight      2 lane, 2 way            Straight, lev…          1
##  3 Clear        Daylight      Unknown                  Straight, lev…          1
##  4 Clear        Daylight      2 lane, 2 way            Straight, gra…          1
##  5 Rain         Daylight      Unknown                  Straight, lev…          0
##  6 Clear        Daylight      4 or more lanes, divided Straight, lev…          1
##  7 Clear        Daylight      2 lane, 2 way            Straight, lev…          1
##  8 Clear        Daylight      2 lane, 2 way            Straight, gra…          1
##  9 Clear        Daylight      4 or more lanes, divided Straight, gra…          1
## 10 Cloudy       Daylight      Unknown                  Straight, lev…          1
## # ℹ 1,285 more rows
## # ℹ 14 more variables: Traffic_Cntl_ID <chr>, Harm_Evnt_ID <chr>,
## #   Intrsct_Relat_ID <chr>, FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>,
## #   Road_Cls_ID <chr>, Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>,
## #   Veh_Body_Styl_ID <chr>, Prsn_Ethnicity_ID <chr>, GenMale <dbl>,
## #   TrafVol <dbl>, Prsn_Age <chr>, Prsn_Injry_Sev_ID <chr>

Column addition using mutate

data <- mutate(data, ScaledTrafficVolume=scale(TrafVol))
data
## # A tibble: 1,295 × 20
##    Wthr_Cond_ID Light_Cond_ID Road_Type_ID             Road_Algn_ID   Surfacedry
##    <chr>        <chr>         <chr>                    <chr>               <dbl>
##  1 Clear        Daylight      2 lane, 2 way            Straight, lev…          1
##  2 Cloudy       Daylight      2 lane, 2 way            Straight, lev…          1
##  3 Clear        Daylight      Unknown                  Straight, lev…          1
##  4 Clear        Daylight      2 lane, 2 way            Straight, gra…          1
##  5 Rain         Daylight      Unknown                  Straight, lev…          0
##  6 Clear        Daylight      4 or more lanes, divided Straight, lev…          1
##  7 Clear        Daylight      2 lane, 2 way            Straight, lev…          1
##  8 Clear        Daylight      2 lane, 2 way            Straight, gra…          1
##  9 Clear        Daylight      4 or more lanes, divided Straight, gra…          1
## 10 Cloudy       Daylight      Unknown                  Straight, lev…          1
## # ℹ 1,285 more rows
## # ℹ 15 more variables: Traffic_Cntl_ID <chr>, Harm_Evnt_ID <chr>,
## #   Intrsct_Relat_ID <chr>, FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>,
## #   Road_Cls_ID <chr>, Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>,
## #   Veh_Body_Styl_ID <chr>, Prsn_Ethnicity_ID <chr>, GenMale <dbl>,
## #   TrafVol <dbl>, Prsn_Age <chr>, Prsn_Injry_Sev_ID <chr>,
## #   ScaledTrafficVolume <dbl[,1]>

#Creating new dataset newdata

newdata <- transmute(data, ScaledTrafficVolume = scale(TrafVol),Personage=Prsn_Age)
newdata
## # A tibble: 1,295 × 2
##    ScaledTrafficVolume[,1] Personage
##                      <dbl> <chr>    
##  1                 -0.343  Other    
##  2                  0.754  Other    
##  3                 -0.919  Other    
##  4                 -0.0401 Other    
##  5                  1.10   Other    
##  6                  1.67   Other    
##  7                 -0.969  Other    
##  8                 -0.672  Other    
##  9                  1.31   Other    
## 10                  1.17   Other    
## # ℹ 1,285 more rows

Piping example

data %>% mutate( Traffictype= factor(TrafVol>5000,labels=c("low","high")))
## # A tibble: 1,295 × 21
##    Wthr_Cond_ID Light_Cond_ID Road_Type_ID             Road_Algn_ID   Surfacedry
##    <chr>        <chr>         <chr>                    <chr>               <dbl>
##  1 Clear        Daylight      2 lane, 2 way            Straight, lev…          1
##  2 Cloudy       Daylight      2 lane, 2 way            Straight, lev…          1
##  3 Clear        Daylight      Unknown                  Straight, lev…          1
##  4 Clear        Daylight      2 lane, 2 way            Straight, gra…          1
##  5 Rain         Daylight      Unknown                  Straight, lev…          0
##  6 Clear        Daylight      4 or more lanes, divided Straight, lev…          1
##  7 Clear        Daylight      2 lane, 2 way            Straight, lev…          1
##  8 Clear        Daylight      2 lane, 2 way            Straight, gra…          1
##  9 Clear        Daylight      4 or more lanes, divided Straight, gra…          1
## 10 Cloudy       Daylight      Unknown                  Straight, lev…          1
## # ℹ 1,285 more rows
## # ℹ 16 more variables: Traffic_Cntl_ID <chr>, Harm_Evnt_ID <chr>,
## #   Intrsct_Relat_ID <chr>, FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>,
## #   Road_Cls_ID <chr>, Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>,
## #   Veh_Body_Styl_ID <chr>, Prsn_Ethnicity_ID <chr>, GenMale <dbl>,
## #   TrafVol <dbl>, Prsn_Age <chr>, Prsn_Injry_Sev_ID <chr>,
## #   ScaledTrafficVolume <dbl[,1]>, Traffictype <fct>
data
## # A tibble: 1,295 × 20
##    Wthr_Cond_ID Light_Cond_ID Road_Type_ID             Road_Algn_ID   Surfacedry
##    <chr>        <chr>         <chr>                    <chr>               <dbl>
##  1 Clear        Daylight      2 lane, 2 way            Straight, lev…          1
##  2 Cloudy       Daylight      2 lane, 2 way            Straight, lev…          1
##  3 Clear        Daylight      Unknown                  Straight, lev…          1
##  4 Clear        Daylight      2 lane, 2 way            Straight, gra…          1
##  5 Rain         Daylight      Unknown                  Straight, lev…          0
##  6 Clear        Daylight      4 or more lanes, divided Straight, lev…          1
##  7 Clear        Daylight      2 lane, 2 way            Straight, lev…          1
##  8 Clear        Daylight      2 lane, 2 way            Straight, gra…          1
##  9 Clear        Daylight      4 or more lanes, divided Straight, gra…          1
## 10 Cloudy       Daylight      Unknown                  Straight, lev…          1
## # ℹ 1,285 more rows
## # ℹ 15 more variables: Traffic_Cntl_ID <chr>, Harm_Evnt_ID <chr>,
## #   Intrsct_Relat_ID <chr>, FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>,
## #   Road_Cls_ID <chr>, Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>,
## #   Veh_Body_Styl_ID <chr>, Prsn_Ethnicity_ID <chr>, GenMale <dbl>,
## #   TrafVol <dbl>, Prsn_Age <chr>, Prsn_Injry_Sev_ID <chr>,
## #   ScaledTrafficVolume <dbl[,1]>

#Grouping by personage

highlow <-group_by(newdata,Personage)
highlow
## # A tibble: 1,295 × 2
## # Groups:   Personage [5]
##    ScaledTrafficVolume[,1] Personage
##                      <dbl> <chr>    
##  1                 -0.343  Other    
##  2                  0.754  Other    
##  3                 -0.919  Other    
##  4                 -0.0401 Other    
##  5                  1.10   Other    
##  6                  1.67   Other    
##  7                 -0.969  Other    
##  8                 -0.672  Other    
##  9                  1.31   Other    
## 10                  1.17   Other    
## # ℹ 1,285 more rows

Piping for newdata

newdata=newdata %>% mutate(Trafficnature=factor(ScaledTrafficVolume>0, labels=c("low","high"))) %>% tail()
newdata
## # A tibble: 6 × 3
##   ScaledTrafficVolume[,1] Personage   Trafficnature
##                     <dbl> <chr>       <fct>        
## 1                  -0.102 15-24 years low          
## 2                  -1.09  15-24 years low          
## 3                   0.299 15-24 years high         
## 4                  -1.11  15-24 years low          
## 5                   1.05  15-24 years high         
## 6                   1.52  15-24 years high

Pivot wider example

wideData <-  pivot_wider(data,names_from = "Road_Type_ID",values_from ="Surfacedry")
wideData
## # A tibble: 1,295 × 23
##    Wthr_Cond_ID Light_Cond_ID Road_Algn_ID    Traffic_Cntl_ID       Harm_Evnt_ID
##    <chr>        <chr>         <chr>           <chr>                 <chr>       
##  1 Clear        Daylight      Straight, level Marked lanes          Motor vehic…
##  2 Cloudy       Daylight      Straight, level Marked lanes          Fixed object
##  3 Clear        Daylight      Straight, level None                  Motor vehic…
##  4 Clear        Daylight      Straight, grade Center stripe/divider Motor vehic…
##  5 Rain         Daylight      Straight, level None                  Fixed object
##  6 Clear        Daylight      Straight, level Marked lanes          Motor vehic…
##  7 Clear        Daylight      Straight, level No passing zone       Motor vehic…
##  8 Clear        Daylight      Straight, grade Marked lanes          Motor vehic…
##  9 Clear        Daylight      Straight, grade Center stripe/divider Motor vehic…
## 10 Cloudy       Daylight      Straight, level None                  Parked car  
## # ℹ 1,285 more rows
## # ℹ 18 more variables: Intrsct_Relat_ID <chr>, FHE_Collsn_ID <chr>,
## #   Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>, Pop_Group_ID <chr>,
## #   Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## #   Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## #   Prsn_Injry_Sev_ID <chr>, ScaledTrafficVolume <dbl[,1]>,
## #   `2 lane, 2 way` <dbl>, Unknown <dbl>, `4 or more lanes, divided` <dbl>, …

PLOTS

Barplot for weather condition

data<-data%>%mutate(Gender=factor(GenMale==1,labels=c("Female","Male")))
ggplot(data, aes(x=Gender)) +geom_bar(,binwidth=1,fill="lightgreen") +theme_minimal() +labs(title="Crashes by Gender", x="Gender", y="Count")
## Warning in geom_bar(, binwidth = 1, fill = "lightgreen"): Ignoring unknown
## parameters: `binwidth`

#Barplot for weather condition by injury severity
ggplot(data, aes(x=Wthr_Cond_ID, fill=Prsn_Injry_Sev_ID)) +
    geom_bar(position="stack") +
    theme_minimal() +
 labs(title="Injury Severity by Weather Condition", x="Weather Condition", y="Count", fill="Injury Severity")

#Boxplot for Road type vs traffic volume
ggplot(data, aes(x=Road_Type_ID, y=TrafVol)) +
    geom_boxplot(fill="lightgreen") +
    theme_bw() +
    labs(title="Traffic Volume by Road Type", x="Road Type", y="Traffic Volume")

#Boxplot for Crash speed limit by weather condition
ggplot(data, aes(x=Wthr_Cond_ID, y=as.numeric(gsub("[^0-9]", "", Crash_Speed_LimitCat)))) + geom_boxplot(fill="lightpink", color="black") + theme_minimal() + labs(title="Crash Speed Limit by Weather Condition", x="Weather Condition", y="Crash Speed Limit")
## Warning: Removed 62 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

#Scatterplot for traffic volume vs crash speed limit
data$Crash_Speed_LimitCat <- as.numeric(gsub("[^0-9]", "", data$Crash_Speed_LimitCat))

ggplot(data, aes(x=Crash_Speed_LimitCat, y=TrafVol)) +
geom_point(color="blue") +
theme_minimal() +labs(title="Traffic Volume vs Crash Speed Limit", x="Crash Speed Limit", y="Traffic Volume")
## Warning: Removed 62 rows containing missing values or values outside the scale range
## (`geom_point()`).

#heatmap for road type vs injury severity
library(reshape2)
heatmap_file <- table(data$Road_Type_ID,data$Prsn_Injry_Sev_ID)
 heatmap_dataframe <- melt(heatmap_file)
 ggplot(heatmap_dataframe, aes(x=Var1, y=Var2, fill=value)) +
 geom_tile() + theme_bw() + labs(title="Heatmap of Road Type vs Injury Severity", x="Road Type", y="Injury Severity")

 # Heatmap for road alignment vs injury severity
 heatmap_file2 <- table(data$Road_Algn_ID,data$Prsn_Injry_Sev_ID)
 heatmap_dataframe2 <- melt(heatmap_file2)
 ggplot(heatmap_dataframe2, aes(x=Var1, y=Var2, fill=value)) +
 geom_tile() + theme_bw() + labs(title="Heatmap of Road alignment vs injury severity", x="Road Alignment", y="Injury Severity")

#histogram for Traffic volume
ggplot(data,aes(x=TrafVol)) + geom_histogram(binwidth=400,fill=rgb(0.5,0.4,0.6),color="blue")+theme_bw()+labs(title="Histogram for traffic volume",x="Traffic volume",y="frequency")

#histogram for Person age
ggplot(data,aes(x=Prsn_Age)) + geom_histogram(stat="count",fill=rgb(0.3,0.8,0.2),color="blue")+theme_bw()+labs(title="Histogram for person age",x="Person age",y="frequency")
## Warning in geom_histogram(stat = "count", fill = rgb(0.3, 0.8, 0.2), color =
## "blue"): Ignoring unknown parameters: `binwidth`, `bins`, and `pad`

Comments: There is a high correlation between Injury severity O and straight and level road alignment and Injury severity O and 2 lane , 2 way road as well as unknown road type.There is high injury severity of all types(O,KA,BC) in clear weather condition. The people from age group 25-54 years are prone to traffic crashes realted injuries. Also, man have the heighest crashes number.

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.