#Title: Homework of AI(Exploratory analysis)
#Install the packages
install.packages(“tidyverse)
install.packages(“dplyr)
install.packages(“readxl”)
install.packages(“ggplot2”)
install.package(“reshape2”)
options(repos=c(CRAN=“https://cran.rstudio.com”))
#loading the library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(ggplot2)
library(readxl)
#Loading data
testdata<-read_excel("C:/Users/rupes/Downloads/HW1_Data.xlsx")
head(testdata)
## # A tibble: 6 × 19
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 Clear Dark, not ligh… 2 lane, 2 w… Straight, l… 1 Marked lanes
## 2 Clear Dark, not ligh… 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 3 Clear Daylight 2 lane, 2 w… Straight, l… 1 Marked lanes
## 4 Clear Daylight 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 5 Clear Dark, not ligh… 2 lane, 2 w… Straight, g… 1 None
## 6 Clear Daylight Unknown Straight, l… 1 None
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## # FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## # Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## # Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## # Prsn_Injry_Sev_ID <chr>
#searching NA data
sum(is.na(testdata))
## [1] 0
#Summary
summary(testdata)
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID
## Length:1295 Length:1295 Length:1295 Length:1295
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## SurfDry Traffic_Cntl_ID Harm_Evnt_ID Intrsct_Relat_ID
## Min. :0.0000 Length:1295 Length:1295 Length:1295
## 1st Qu.:1.0000 Class :character Class :character Class :character
## Median :1.0000 Mode :character Mode :character Mode :character
## Mean :0.9143
## 3rd Qu.:1.0000
## Max. :1.0000
## FHE_Collsn_ID Road_Part_Adj_ID Road_Cls_ID Pop_Group_ID
## Length:1295 Length:1295 Length:1295 Length:1295
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Crash_Speed_LimitCat Veh_Body_Styl_ID Prsn_Ethnicity_ID GenMale
## Length:1295 Length:1295 Length:1295 Min. :0.0000
## Class :character Class :character Class :character 1st Qu.:1.0000
## Mode :character Mode :character Mode :character Median :1.0000
## Mean :0.8842
## 3rd Qu.:1.0000
## Max. :1.0000
## TrafVol Prsn_Age Prsn_Injry_Sev_ID
## Min. : 215 Length:1295 Length:1295
## 1st Qu.: 6765 Class :character Class :character
## Median :14170 Mode :character Mode :character
## Mean :14417
## 3rd Qu.:22010
## Max. :28970
str(testdata)
## tibble [1,295 × 19] (S3: tbl_df/tbl/data.frame)
## $ Wthr_Cond_ID : chr [1:1295] "Clear" "Clear" "Clear" "Clear" ...
## $ Light_Cond_ID : chr [1:1295] "Dark, not lighted" "Dark, not lighted" "Daylight" "Daylight" ...
## $ Road_Type_ID : chr [1:1295] "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" "2 lane, 2 way" ...
## $ Road_Algn_ID : chr [1:1295] "Straight, level" "Straight, level" "Straight, level" "Straight, level" ...
## $ SurfDry : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
## $ Traffic_Cntl_ID : chr [1:1295] "Marked lanes" "Center stripe/divider" "Marked lanes" "Center stripe/divider" ...
## $ Harm_Evnt_ID : chr [1:1295] "Motor vehicle in transport" "Motor vehicle in transport" "Motor vehicle in transport" "Fixed object" ...
## $ Intrsct_Relat_ID : chr [1:1295] "Non intersection" "Non intersection" "Intersection" "Non intersection" ...
## $ FHE_Collsn_ID : chr [1:1295] "Sd both going straight-rear end" "Sd both going straight-rear end" "Other" "Omv vehicle going straight" ...
## $ Road_Part_Adj_ID : chr [1:1295] "Main/proper lane" "Main/proper lane" "Main/proper lane" "Main/proper lane" ...
## $ Road_Cls_ID : chr [1:1295] "Farm to market" "Us & state highways" "Farm to market" "Us & state highways" ...
## $ Pop_Group_ID : chr [1:1295] "10,000 - 24,999 pop" "Rural" "Other" "Rural" ...
## $ Crash_Speed_LimitCat: chr [1:1295] "30-40 mph" "65-70 mph" "45-60 mph" "65-70 mph" ...
## $ Veh_Body_Styl_ID : chr [1:1295] "Farm equipment" "Farm equipment" "Farm equipment" "Farm equipment" ...
## $ Prsn_Ethnicity_ID : chr [1:1295] "White" "White" "White" "White" ...
## $ GenMale : num [1:1295] 1 1 1 1 1 1 1 1 1 1 ...
## $ TrafVol : num [1:1295] 7654 13770 11470 16972 413 ...
## $ Prsn_Age : chr [1:1295] "25-54 years" "25-54 years" "Other" "25-54 years" ...
## $ Prsn_Injry_Sev_ID : chr [1:1295] "O" "O" "O" "O" ...
#General operations for viewing dataset
#looking values of data from lighting condition to surface condition
head(select(testdata,Light_Cond_ID:SurfDry))
## # A tibble: 6 × 4
## Light_Cond_ID Road_Type_ID Road_Algn_ID SurfDry
## <chr> <chr> <chr> <dbl>
## 1 Dark, not lighted 2 lane, 2 way Straight, level 1
## 2 Dark, not lighted 2 lane, 2 way Straight, level 1
## 3 Daylight 2 lane, 2 way Straight, level 1
## 4 Daylight 2 lane, 2 way Straight, level 1
## 5 Dark, not lighted 2 lane, 2 way Straight, grade 1
## 6 Daylight Unknown Straight, level 1
# presenting data for Gender female
head(filter(testdata, GenMale ==0))
## # A tibble: 6 × 19
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 Cloudy Daylight 2 lane, 2 w… Straight, l… 1 Marked lanes
## 2 Clear Dark, not ligh… 2 lane, 2 w… Straight, g… 1 Marked lanes
## 3 Clear Daylight 2 lane, 2 w… Straight, g… 1 Center stripe/…
## 4 Rain Daylight Unknown Straight, l… 0 None
## 5 Cloudy Daylight Unknown Straight, l… 1 None
## 6 Clear Daylight Unknown Curve, level 0 None
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## # FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## # Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## # Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## # Prsn_Injry_Sev_ID <chr>
# Arranging data in ascending order based on person age
head(data <- arrange(testdata, Prsn_Age))
## # A tibble: 6 × 19
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 Clear Daylight 4 or more l… Curve, level 1 Marked lanes
## 2 Clear Daylight 2 lane, 2 w… Straight, g… 1 Other
## 3 Cloudy Dark, not ligh… Unknown Straight, l… 1 None
## 4 Clear Daylight 2 lane, 2 w… Straight, l… 1 Marked lanes
## 5 Cloudy Dark, not ligh… Unknown Straight, l… 0 None
## 6 Clear Daylight 2 lane, 2 w… Straight, l… 1 Center stripe/…
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## # FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## # Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## # Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## # Prsn_Injry_Sev_ID <chr>
#Variable name renaming
head(data <- rename(testdata, Harmevent = "Harm_Evnt_ID"))
## # A tibble: 6 × 19
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 Clear Dark, not ligh… 2 lane, 2 w… Straight, l… 1 Marked lanes
## 2 Clear Dark, not ligh… 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 3 Clear Daylight 2 lane, 2 w… Straight, l… 1 Marked lanes
## 4 Clear Daylight 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 5 Clear Dark, not ligh… 2 lane, 2 w… Straight, g… 1 None
## 6 Clear Daylight Unknown Straight, l… 1 None
## # ℹ 13 more variables: Harmevent <chr>, Intrsct_Relat_ID <chr>,
## # FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## # Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## # Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## # Prsn_Injry_Sev_ID <chr>
#Creating new dataset using transmute
testdata2 <- transmute(testdata, ScaledTrafficVolume = scale(TrafVol),Roadalignment=Road_Algn_ID)
head(testdata2)
## # A tibble: 6 × 2
## ScaledTrafficVolume[,1] Roadalignment
## <dbl> <chr>
## 1 -0.787 Straight, level
## 2 -0.0753 Straight, level
## 3 -0.343 Straight, level
## 4 0.297 Straight, level
## 5 -1.63 Straight, grade
## 6 -1.34 Straight, level
testdata %>% mutate( Trafficnature= factor(TrafVol>7000,labels=c("low","high")))
## # A tibble: 1,295 × 20
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 Clear Dark, not lig… 2 lane, 2 w… Straight, l… 1 Marked lanes
## 2 Clear Dark, not lig… 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 3 Clear Daylight 2 lane, 2 w… Straight, l… 1 Marked lanes
## 4 Clear Daylight 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 5 Clear Dark, not lig… 2 lane, 2 w… Straight, g… 1 None
## 6 Clear Daylight Unknown Straight, l… 1 None
## 7 Clear Daylight 4 or more l… Curve, level 1 Marked lanes
## 8 Clear Daylight 4 or more l… Curve, level 1 Marked lanes
## 9 Clear Daylight Unknown Straight, l… 1 None
## 10 Clear Daylight 2 lane, 2 w… Straight, l… 1 Center stripe/…
## # ℹ 1,285 more rows
## # ℹ 14 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## # FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## # Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## # Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## # Prsn_Injry_Sev_ID <chr>, Trafficnature <fct>
head(testdata)
## # A tibble: 6 × 19
## Wthr_Cond_ID Light_Cond_ID Road_Type_ID Road_Algn_ID SurfDry Traffic_Cntl_ID
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 Clear Dark, not ligh… 2 lane, 2 w… Straight, l… 1 Marked lanes
## 2 Clear Dark, not ligh… 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 3 Clear Daylight 2 lane, 2 w… Straight, l… 1 Marked lanes
## 4 Clear Daylight 2 lane, 2 w… Straight, l… 1 Center stripe/…
## 5 Clear Dark, not ligh… 2 lane, 2 w… Straight, g… 1 None
## 6 Clear Daylight Unknown Straight, l… 1 None
## # ℹ 13 more variables: Harm_Evnt_ID <chr>, Intrsct_Relat_ID <chr>,
## # FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>, Road_Cls_ID <chr>,
## # Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>, Veh_Body_Styl_ID <chr>,
## # Prsn_Ethnicity_ID <chr>, GenMale <dbl>, TrafVol <dbl>, Prsn_Age <chr>,
## # Prsn_Injry_Sev_ID <chr>
testdata2=testdata2 %>%mutate(Natureoftraffic=factor(ScaledTrafficVolume>0, labels=c("low","high")))
tail(testdata2)
## # A tibble: 6 × 3
## ScaledTrafficVolume[,1] Roadalignment Natureoftraffic
## <dbl> <chr> <fct>
## 1 1.05 Straight, level high
## 2 -0.0683 Straight, level low
## 3 1.52 Straight, level high
## 4 1.15 Straight, level high
## 5 -0.623 Straight, level low
## 6 -0.596 Straight, level low
PivotDatawide <- pivot_wider(testdata,names_from = "Traffic_Cntl_ID",values_from ="Light_Cond_ID")
tail(PivotDatawide)
## # A tibble: 6 × 22
## Wthr_Cond_ID Road_Type_ID Road_Algn_ID SurfDry Harm_Evnt_ID Intrsct_Relat_ID
## <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 Clear 4 or more lan… Straight, l… 1 Motor vehic… Intersection
## 2 Clear 2 lane, 2 way Straight, l… 1 Motor vehic… Non intersection
## 3 Clear 4 or more lan… Straight, l… 1 Fixed object Non intersection
## 4 Clear 4 or more lan… Straight, l… 1 Motor vehic… Intersection re…
## 5 Clear 2 lane, 2 way Straight, l… 1 Motor vehic… Non intersection
## 6 Cloudy Unknown Straight, l… 1 Motor vehic… Intersection
## # ℹ 16 more variables: FHE_Collsn_ID <chr>, Road_Part_Adj_ID <chr>,
## # Road_Cls_ID <chr>, Pop_Group_ID <chr>, Crash_Speed_LimitCat <chr>,
## # Veh_Body_Styl_ID <chr>, Prsn_Ethnicity_ID <chr>, GenMale <dbl>,
## # TrafVol <dbl>, Prsn_Age <chr>, Prsn_Injry_Sev_ID <chr>,
## # `Marked lanes` <chr>, `Center stripe/divider` <chr>, None <chr>,
## # Other <chr>, `No passing zone` <chr>
#Scatterplot for traffic volume vs Weather condition
ggplot(testdata, aes(x=Wthr_Cond_ID, y=TrafVol)) +
geom_point(color="lightblue") +
theme_bw() +labs(title="Traffic Volume vs weather condition", x="weather condition", y="Traffic Volume")
#Boxplot for Lighting condition vs traffic volume
ggplot(testdata, aes(x=Light_Cond_ID, y=TrafVol)) +
geom_boxplot(fill="lightpink") +
theme_minimal() +
labs(title="Traffic Volume vs lighting condition", x="lighting condition", y="Traffic Volume")
#Barplot for road alignment by weather condition
ggplot(testdata, aes(x=Road_Algn_ID, fill=Wthr_Cond_ID)) +
geom_bar(position="stack") +
theme_minimal() +
labs(title="road alignment by weather condition", x="Road alignment", y="Count", fill="weather condition")
#Heatmap for person ethnicity vs injury severity
library(reshape2)
File <- table(testdata$Prsn_Ethnicity_ID,testdata$Prsn_Injry_Sev_ID)
dataframe <- melt(File)
ggplot(dataframe, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() + theme_minimal() + labs(title="Heatmap of person ethnicity vs Injury Severity", x="Person ethnicity", y="Injury Severity")
#Histogram for Person age
ggplot(testdata,aes(x=Prsn_Age)) + geom_histogram(stat="count",fill="lightblue",color="blue")+theme_bw()+labs(title="Histogram for person age",x="Person age",y="frequency")
## Warning in geom_histogram(stat = "count", fill = "lightblue", color = "blue"):
## Ignoring unknown parameters: `binwidth`, `bins`, and `pad`
#Heatmap for road type vs Vehicle body style
library(reshape2)
File2 <- table(testdata$Road_Type_ID,testdata$Veh_Body_Styl_ID)
dataframe2 <- melt(File2)
ggplot(dataframe2, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() + theme_bw() + labs(title="Heatmap of Road Type vs Vehicle body style", x="Road Type", y="vehicle body style")
#histogram for Person ethnicity
ggplot(testdata,aes(x=Prsn_Ethnicity_ID)) + geom_histogram(stat="count",fill=rgb(0.9,0.5,0.5),color="black")+theme_bw()+labs(title="Histogram for person ethnicity",x="Person ethnicity",y="frequency")
## Warning in geom_histogram(stat = "count", fill = rgb(0.9, 0.5, 0.5), color =
## "black"): Ignoring unknown parameters: `binwidth`, `bins`, and `pad`
Conclusion: White people are the one having most crashes.25-54 age group people have high rate of being involved in road crashes.White and hispanic people have high correlation with Injury severity type O.