#Title: Homework of AI(Exploratory analysis)

#Install the packages

install.packages(“tidyverse)

install.packages(“dplyr)

install.packages(“readxl”)

install.packages(“ggplot2”)

install.package(“reshape2”)

R Markdown

options(repos=c(CRAN=“https://cran.rstudio.com”))

#loading the library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(ggplot2)
library(readxl)
#Loading data
testdata<-read_excel("C:/Users/rupes/Downloads/HW1_Data.xlsx")
head(testdata)
## # A tibble: 6 × 19
##   Wthr_Cond_ID Light_Cond_ID   Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
##   <chr>        <chr>           <chr>        <chr>          <dbl> <chr>          
## 1 Clear        Dark, not ligh… 2 lane, 2 w… Straight, l…       1 Marked lanes   
## 2 Clear        Dark, not ligh… 2 lane, 2 w… Straight, l…       1 Center stripe/…
## 3 Clear        Daylight        2 lane, 2 w… Straight, l…       1 Marked lanes   
## 4 Clear        Daylight        2 lane, 2 w… Straight, l…       1 Center stripe/…
## 5 Clear        Dark, not ligh… 2 lane, 2 w… Straight, g…       1 None           
## 6 Clear        Daylight        Unknown      Straight, l…       1 None           
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## #   FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## #   Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## #   Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## #   Prsn_Injry_Sev_ID <chr>

#searching NA data

sum(is.na(testdata))
## [1] 0

#Summary

summary(testdata)
##  Wthr_Cond_ID       Light_Cond_ID      Road_Type_ID       Road_Algn_ID      
##  Length:1295        Length:1295        Length:1295        Length:1295       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     SurfDry       Traffic_Cntl_ID    Harm_Evnt_ID       Intrsct_Relat_ID  
##  Min.   :0.0000   Length:1295        Length:1295        Length:1295       
##  1st Qu.:1.0000   Class :character   Class :character   Class :character  
##  Median :1.0000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.9143                                                           
##  3rd Qu.:1.0000                                                           
##  Max.   :1.0000                                                           
##  FHE_Collsn_ID      Road_Part_Adj_ID   Road_Cls_ID        Pop_Group_ID      
##  Length:1295        Length:1295        Length:1295        Length:1295       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Crash_Speed_LimitCat Veh_Body_Styl_ID   Prsn_Ethnicity_ID     GenMale      
##  Length:1295          Length:1295        Length:1295        Min.   :0.0000  
##  Class :character     Class :character   Class :character   1st Qu.:1.0000  
##  Mode  :character     Mode  :character   Mode  :character   Median :1.0000  
##                                                             Mean   :0.8842  
##                                                             3rd Qu.:1.0000  
##                                                             Max.   :1.0000  
##     TrafVol        Prsn_Age         Prsn_Injry_Sev_ID 
##  Min.   :  215   Length:1295        Length:1295       
##  1st Qu.: 6765   Class :character   Class :character  
##  Median :14170   Mode  :character   Mode  :character  
##  Mean   :14417                                        
##  3rd Qu.:22010                                        
##  Max.   :28970
str(testdata)
## tibble [1,295 × 19] (S3: tbl_df/tbl/data.frame)
##  $ Wthr_Cond_ID        : chr [1:1295] "Clear" "Clear" "Clear" "Clear" ...
##  $ Light_Cond_ID       : chr [1:1295] "Dark, not lighted" "Dark, not lighted" "Daylight" "Daylight" ...
##  $ Road_Type_ID        : chr [1:1295] "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" ...
##  $ Road_Algn_ID        : chr [1:1295] "Straight, level" "Straight, level" "Straight, level" "Straight, level" ...
##  $ SurfDry             : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Traffic_Cntl_ID     : chr [1:1295] "Marked lanes" "Center stripe/divider" "Marked lanes" "Center stripe/divider" ...
##  $ Harm_Evnt_ID        : chr [1:1295] "Motor vehicle in transport" "Motor vehicle in transport" "Motor vehicle in transport" "Fixed object" ...
##  $ Intrsct_Relat_ID    : chr [1:1295] "Non intersection" "Non intersection" "Intersection" "Non intersection" ...
##  $ FHE_Collsn_ID       : chr [1:1295] "Sd both going straight-rear end" "Sd both going straight-rear end" "Other" "Omv vehicle going straight" ...
##  $ Road_Part_Adj_ID    : chr [1:1295] "Main/proper lane" "Main/proper lane" "Main/proper lane" "Main/proper lane" ...
##  $ Road_Cls_ID         : chr [1:1295] "Farm to market" "Us & state highways" "Farm to market" "Us & state highways" ...
##  $ Pop_Group_ID        : chr [1:1295] "10,000 - 24,999 pop" "Rural" "Other" "Rural" ...
##  $ Crash_Speed_LimitCat: chr [1:1295] "30-40 mph" "65-70 mph" "45-60 mph" "65-70 mph" ...
##  $ Veh_Body_Styl_ID    : chr [1:1295] "Farm equipment" "Farm equipment" "Farm equipment" "Farm equipment" ...
##  $ Prsn_Ethnicity_ID   : chr [1:1295] "White" "White" "White" "White" ...
##  $ GenMale             : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
##  $ TrafVol             : num [1:1295] 7654 13770 11470 16972 413 ...
##  $ Prsn_Age            : chr [1:1295] "25-54 years" "25-54 years" "Other" "25-54 years" ...
##  $ Prsn_Injry_Sev_ID   : chr [1:1295] "O" "O" "O" "O" ...

#General operations for viewing dataset

#looking values of data from lighting condition to surface condition
head(select(testdata,Light_Cond_ID:SurfDry))
## # A tibble: 6 × 4
##   Light_Cond_ID     Road_Type_ID  Road_Algn_ID    SurfDry
##   <chr>             <chr>         <chr>             <dbl>
## 1 Dark, not lighted 2 lane, 2 way Straight, level       1
## 2 Dark, not lighted 2 lane, 2 way Straight, level       1
## 3 Daylight          2 lane, 2 way Straight, level       1
## 4 Daylight          2 lane, 2 way Straight, level       1
## 5 Dark, not lighted 2 lane, 2 way Straight, grade       1
## 6 Daylight          Unknown       Straight, level       1
# presenting data for Gender female
head(filter(testdata, GenMale ==0))
## # A tibble: 6 × 19
##   Wthr_Cond_ID Light_Cond_ID   Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
##   <chr>        <chr>           <chr>        <chr>          <dbl> <chr>          
## 1 Cloudy       Daylight        2 lane, 2 w… Straight, l…       1 Marked lanes   
## 2 Clear        Dark, not ligh… 2 lane, 2 w… Straight, g…       1 Marked lanes   
## 3 Clear        Daylight        2 lane, 2 w… Straight, g…       1 Center stripe/…
## 4 Rain         Daylight        Unknown      Straight, l…       0 None           
## 5 Cloudy       Daylight        Unknown      Straight, l…       1 None           
## 6 Clear        Daylight        Unknown      Curve, level       0 None           
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## #   FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## #   Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## #   Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## #   Prsn_Injry_Sev_ID <chr>
# Arranging data in ascending order based on person age
head(data <- arrange(testdata, Prsn_Age))
## # A tibble: 6 × 19
##   Wthr_Cond_ID Light_Cond_ID   Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
##   <chr>        <chr>           <chr>        <chr>          <dbl> <chr>          
## 1 Clear        Daylight        4 or more l… Curve, level       1 Marked lanes   
## 2 Clear        Daylight        2 lane, 2 w… Straight, g…       1 Other          
## 3 Cloudy       Dark, not ligh… Unknown      Straight, l…       1 None           
## 4 Clear        Daylight        2 lane, 2 w… Straight, l…       1 Marked lanes   
## 5 Cloudy       Dark, not ligh… Unknown      Straight, l…       0 None           
## 6 Clear        Daylight        2 lane, 2 w… Straight, l…       1 Center stripe/…
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## #   FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## #   Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## #   Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## #   Prsn_Injry_Sev_ID <chr>
#Variable name renaming
head(data <- rename(testdata, Harmevent = "Harm_Evnt_ID"))
## # A tibble: 6 × 19
##   Wthr_Cond_ID Light_Cond_ID   Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
##   <chr>        <chr>           <chr>        <chr>          <dbl> <chr>          
## 1 Clear        Dark, not ligh… 2 lane, 2 w… Straight, l…       1 Marked lanes   
## 2 Clear        Dark, not ligh… 2 lane, 2 w… Straight, l…       1 Center stripe/…
## 3 Clear        Daylight        2 lane, 2 w… Straight, l…       1 Marked lanes   
## 4 Clear        Daylight        2 lane, 2 w… Straight, l…       1 Center stripe/…
## 5 Clear        Dark, not ligh… 2 lane, 2 w… Straight, g…       1 None           
## 6 Clear        Daylight        Unknown      Straight, l…       1 None           
## # ℹ 13 more variables: Harmevent <chr>, Intrsct_Relat_ID <chr>,
## #   FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## #   Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## #   Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## #   Prsn_Injry_Sev_ID <chr>

#Creating new dataset using transmute

testdata2 <- transmute(testdata, ScaledTrafficVolume = scale(TrafVol),Roadalignment=Road_Algn_ID)
head(testdata2)
## # A tibble: 6 × 2
##   ScaledTrafficVolume[,1] Roadalignment  
##                     <dbl> <chr>          
## 1                 -0.787  Straight, level
## 2                 -0.0753 Straight, level
## 3                 -0.343  Straight, level
## 4                  0.297  Straight, level
## 5                 -1.63   Straight, grade
## 6                 -1.34   Straight, level

Piping function

testdata %>% mutate( Trafficnature= factor(TrafVol>7000,labels=c("low","high")))
## # A tibble: 1,295 × 20
##    Wthr_Cond_ID Light_Cond_ID  Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
##    <chr>        <chr>          <chr>        <chr>          <dbl> <chr>          
##  1 Clear        Dark, not lig… 2 lane, 2 w… Straight, l…       1 Marked lanes   
##  2 Clear        Dark, not lig… 2 lane, 2 w… Straight, l…       1 Center stripe/…
##  3 Clear        Daylight       2 lane, 2 w… Straight, l…       1 Marked lanes   
##  4 Clear        Daylight       2 lane, 2 w… Straight, l…       1 Center stripe/…
##  5 Clear        Dark, not lig… 2 lane, 2 w… Straight, g…       1 None           
##  6 Clear        Daylight       Unknown      Straight, l…       1 None           
##  7 Clear        Daylight       4 or more l… Curve, level       1 Marked lanes   
##  8 Clear        Daylight       4 or more l… Curve, level       1 Marked lanes   
##  9 Clear        Daylight       Unknown      Straight, l…       1 None           
## 10 Clear        Daylight       2 lane, 2 w… Straight, l…       1 Center stripe/…
## # ℹ 1,285 more rows
## # ℹ 14 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## #   FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## #   Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## #   Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## #   Prsn_Injry_Sev_ID <chr>, Trafficnature <fct>
head(testdata)
## # A tibble: 6 × 19
##   Wthr_Cond_ID Light_Cond_ID   Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
##   <chr>        <chr>           <chr>        <chr>          <dbl> <chr>          
## 1 Clear        Dark, not ligh… 2 lane, 2 w… Straight, l…       1 Marked lanes   
## 2 Clear        Dark, not ligh… 2 lane, 2 w… Straight, l…       1 Center stripe/…
## 3 Clear        Daylight        2 lane, 2 w… Straight, l…       1 Marked lanes   
## 4 Clear        Daylight        2 lane, 2 w… Straight, l…       1 Center stripe/…
## 5 Clear        Dark, not ligh… 2 lane, 2 w… Straight, g…       1 None           
## 6 Clear        Daylight        Unknown      Straight, l…       1 None           
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## #   FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## #   Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## #   Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## #   Prsn_Injry_Sev_ID <chr>

Piping with new data

testdata2=testdata2 %>%mutate(Natureoftraffic=factor(ScaledTrafficVolume>0, labels=c("low","high")))
tail(testdata2)
## # A tibble: 6 × 3
##   ScaledTrafficVolume[,1] Roadalignment   Natureoftraffic
##                     <dbl> <chr>           <fct>          
## 1                  1.05   Straight, level high           
## 2                 -0.0683 Straight, level low            
## 3                  1.52   Straight, level high           
## 4                  1.15   Straight, level high           
## 5                 -0.623  Straight, level low            
## 6                 -0.596  Straight, level low

Pivot wider example

PivotDatawide <-  pivot_wider(testdata,names_from = "Traffic_Cntl_ID",values_from ="Light_Cond_ID")
tail(PivotDatawide)
## # A tibble: 6 × 22
##   Wthr_Cond_ID Road_Type_ID   Road_Algn_ID SurfDry Harm_Evnt_ID Intrsct_Relat_ID
##   <chr>        <chr>          <chr>          <dbl> <chr>        <chr>           
## 1 Clear        4 or more lan… Straight, l…       1 Motor vehic… Intersection    
## 2 Clear        2 lane, 2 way  Straight, l…       1 Motor vehic… Non intersection
## 3 Clear        4 or more lan… Straight, l…       1 Fixed object Non intersection
## 4 Clear        4 or more lan… Straight, l…       1 Motor vehic… Intersection re…
## 5 Clear        2 lane, 2 way  Straight, l…       1 Motor vehic… Non intersection
## 6 Cloudy       Unknown        Straight, l…       1 Motor vehic… Intersection    
## # ℹ 16 more variables: FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>,
## #   Road_Cls_ID <chr>, Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>,
## #   Veh_Body_Styl_ID <chr>, Prsn_Ethnicity_ID <chr>, GenMale <dbl>,
## #   TrafVol <dbl>, Prsn_Age <chr>, Prsn_Injry_Sev_ID <chr>,
## #   `Marked lanes` <chr>, `Center stripe/divider` <chr>, None <chr>,
## #   Other <chr>, `No passing zone` <chr>

PLOTS

#Scatterplot for traffic volume vs Weather condition

ggplot(testdata, aes(x=Wthr_Cond_ID, y=TrafVol)) +
geom_point(color="lightblue") +
theme_bw() +labs(title="Traffic Volume vs weather condition", x="weather condition", y="Traffic Volume")

#Boxplot for Lighting condition vs traffic volume
ggplot(testdata, aes(x=Light_Cond_ID, y=TrafVol)) +
    geom_boxplot(fill="lightpink") +
    theme_minimal() +
    labs(title="Traffic Volume vs lighting condition", x="lighting condition", y="Traffic Volume")

#Barplot for road alignment by weather condition
ggplot(testdata, aes(x=Road_Algn_ID, fill=Wthr_Cond_ID)) +
    geom_bar(position="stack") +
    theme_minimal() +
 labs(title="road alignment by weather condition", x="Road alignment", y="Count", fill="weather condition")

#Heatmap for person ethnicity vs injury severity
library(reshape2)
File <- table(testdata$Prsn_Ethnicity_ID,testdata$Prsn_Injry_Sev_ID)
dataframe <- melt(File)
 ggplot(dataframe, aes(x=Var1, y=Var2, fill=value)) +
 geom_tile() + theme_minimal() + labs(title="Heatmap of person ethnicity vs Injury Severity", x="Person ethnicity", y="Injury Severity")

#Histogram for Person age
ggplot(testdata,aes(x=Prsn_Age)) + geom_histogram(stat="count",fill="lightblue",color="blue")+theme_bw()+labs(title="Histogram for person age",x="Person age",y="frequency")
## Warning in geom_histogram(stat = "count", fill = "lightblue", color = "blue"):
## Ignoring unknown parameters: `binwidth`, `bins`, and `pad`

#Heatmap for road type vs Vehicle body style
library(reshape2)
File2 <- table(testdata$Road_Type_ID,testdata$Veh_Body_Styl_ID)
dataframe2 <- melt(File2)
ggplot(dataframe2, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() + theme_bw() + labs(title="Heatmap of Road Type vs Vehicle body style", x="Road Type", y="vehicle body style")

#histogram for Person ethnicity
ggplot(testdata,aes(x=Prsn_Ethnicity_ID)) + geom_histogram(stat="count",fill=rgb(0.9,0.5,0.5),color="black")+theme_bw()+labs(title="Histogram for person ethnicity",x="Person ethnicity",y="frequency")
## Warning in geom_histogram(stat = "count", fill = rgb(0.9, 0.5, 0.5), color =
## "black"): Ignoring unknown parameters: `binwidth`, `bins`, and `pad`

Conclusion: White people are the one having most crashes.25-54 age group people have high rate of being involved in road crashes.White and hispanic people have high correlation with Injury severity type O.