# install plyr and dplyr packages
library(plyr)
library(dplyr)
# load dataset from either of 2 file locations: C drive or GitHub
# location <- "C:/Users/Kevin/Documents/MSDS/Week2 Assignments/NASScds.csv"
# location <- "https://raw.githubusercontent.com/kecbenson/Bridge_Program/master/nassCDS.csv"
# use GitHub file for this assignment
location <- "https://raw.githubusercontent.com/kecbenson/Bridge_Program/master/nassCDS.csv"
df <- as.tbl(read.csv(file=location))
df
## # A tibble: 26,217 x 16
## X dvcat weight dead airbag seatbelt frontal sex ageOFocc yearacc
## <int> <fct> <dbl> <fct> <fct> <fct> <int> <fct> <int> <int>
## 1 1 25-39 25.1 alive none belted 1 f 26 1997
## 2 2 10-24 25.1 alive airbag belted 1 f 72 1997
## 3 3 10-24 32.4 alive none none 1 f 69 1997
## 4 4 25-39 495. alive airbag belted 1 f 53 1997
## 5 5 25-39 25.1 alive none belted 1 f 32 1997
## 6 6 40-54 25.1 alive none belted 1 f 22 1997
## 7 7 55+ 27.1 alive none belted 1 m 22 1997
## 8 8 55+ 27.1 dead none none 1 m 32 1997
## 9 9 10-24 813. alive none belted 0 m 40 1997
## 10 10 10-24 813. alive none belted 1 f 18 1997
## # ... with 26,207 more rows, and 6 more variables: yearVeh <int>,
## # abcat <fct>, occRole <fct>, deploy <int>, injSeverity <int>,
## # caseid <fct>
summary(df)
## X dvcat weight dead
## Min. : 1 1-9km/h: 686 Min. : 0.00 alive:25037
## 1st Qu.: 6555 10-24 :12848 1st Qu.: 32.47 dead : 1180
## Median :13109 25-39 : 8214 Median : 86.99
## Mean :13109 40-54 : 2977 Mean : 462.81
## 3rd Qu.:19663 55+ : 1492 3rd Qu.: 364.72
## Max. :26217 Max. :57871.59
##
## airbag seatbelt frontal sex ageOFocc
## airbag:14419 belted:18573 Min. :0.0000 f:12248 Min. :16.00
## none :11798 none : 7644 1st Qu.:0.0000 m:13969 1st Qu.:22.00
## Median :1.0000 Median :33.00
## Mean :0.6433 Mean :37.21
## 3rd Qu.:1.0000 3rd Qu.:48.00
## Max. :1.0000 Max. :97.00
##
## yearacc yearVeh abcat occRole
## Min. :1997 Min. :1953 deploy : 8836 driver:20601
## 1st Qu.:1998 1st Qu.:1989 nodeploy: 5583 pass : 5616
## Median :2000 Median :1994 unavail :11798
## Mean :2000 Mean :1993
## 3rd Qu.:2001 3rd Qu.:1997
## Max. :2002 Max. :2003
## NA's :1
## deploy injSeverity caseid
## Min. :0.000 Min. :0.000 13:157:1: 11
## 1st Qu.:0.000 1st Qu.:1.000 11:131:1: 10
## Median :0.000 Median :2.000 2:94:1 : 10
## Mean :0.337 Mean :1.716 12:50:1 : 9
## 3rd Qu.:1.000 3rd Qu.:3.000 41:49:1 : 9
## Max. :1.000 Max. :6.000 43:169:1: 9
## NA's :153 (Other) :26159
# calculate mean & median of ageOFocc & yearVeh
paste("Age of Occupant: Mean = ", sprintf("%0.1f", mean(df$ageOFocc)), ", Med = ", sprintf("%0.1f", median(df$ageOFocc)))
## [1] "Age of Occupant: Mean = 37.2 , Med = 33.0"
paste("Year of Vehicle: Mean = ", sprintf("%0.1f", mean(df$yearVeh, na.rm=TRUE)), ", Med = ", sprintf("%0.1f", median(df$yearVeh, na.rm=TRUE)))
## [1] "Year of Vehicle: Mean = 1992.8 , Med = 1994.0"
# or can use summarize function for mean & median of ageOFocc & yearVeh
summarize(df, AvgAge=mean(ageOFocc), MedAge=median(ageOFocc), AvgYrVeh=mean(yearVeh, na.rm=TRUE), MedYrVeh=median(yearVeh, na.rm=TRUE))
## # A tibble: 1 x 4
## AvgAge MedAge AvgYrVeh MedYrVeh
## <dbl> <int> <dbl> <dbl>
## 1 37.2 33 1993. 1994
Create a new data frame with a subset of the columns and rows. Make sure to rename it.
# filter on accidents that resulted in fatalities, then reduce columns to more compact set of variables
df2 <- filter(df, dead == 'dead')
df3 <- select(df2, dvcat, airbag, seatbelt, frontal, sex, ageOFocc, yearVeh, abcat, occRole, deploy)
df3
## # A tibble: 1,180 x 10
## dvcat airbag seatbelt frontal sex ageOFocc yearVeh abcat occRole
## <fct> <fct> <fct> <int> <fct> <int> <int> <fct> <fct>
## 1 55+ none none 1 m 32 1987 unavail driver
## 2 25-39 airbag belted 0 f 54 1994 nodeploy driver
## 3 55+ none belted 1 m 67 1992 unavail driver
## 4 55+ none belted 1 f 64 1992 unavail pass
## 5 55+ none none 1 m 23 1986 unavail driver
## 6 25-39 none belted 0 f 77 1992 unavail pass
## 7 55+ none none 0 m 34 1990 unavail driver
## 8 10-24 airbag belted 1 m 37 1995 deploy driver
## 9 40-54 none none 0 f 33 1987 unavail driver
## 10 25-39 airbag none 1 f 84 1996 deploy driver
## # ... with 1,170 more rows, and 1 more variable: deploy <int>
summary(df3)
## dvcat airbag seatbelt frontal sex
## 1-9km/h: 3 airbag:511 belted:500 Min. :0.0000 f:464
## 10-24 :114 none :669 none :680 1st Qu.:0.0000 m:716
## 25-39 :304 Median :0.0000
## 40-54 :344 Mean :0.4932
## 55+ :415 3rd Qu.:1.0000
## Max. :1.0000
## ageOFocc yearVeh abcat occRole
## Min. :16.00 Min. :1963 deploy :368 driver:891
## 1st Qu.:25.00 1st Qu.:1988 nodeploy:143 pass :289
## Median :41.00 Median :1992 unavail :669
## Mean :44.62 Mean :1991
## 3rd Qu.:61.00 3rd Qu.:1996
## Max. :97.00 Max. :2003
## deploy
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3119
## 3rd Qu.:1.0000
## Max. :1.0000
# or can use subset function to do the same thing
df4 <- subset(df, dead == 'dead', select = c(dvcat, airbag, seatbelt, frontal, sex, ageOFocc, yearVeh, abcat, occRole, deploy))
summary(df4)
## dvcat airbag seatbelt frontal sex
## 1-9km/h: 3 airbag:511 belted:500 Min. :0.0000 f:464
## 10-24 :114 none :669 none :680 1st Qu.:0.0000 m:716
## 25-39 :304 Median :0.0000
## 40-54 :344 Mean :0.4932
## 55+ :415 3rd Qu.:1.0000
## Max. :1.0000
## ageOFocc yearVeh abcat occRole
## Min. :16.00 Min. :1963 deploy :368 driver:891
## 1st Qu.:25.00 1st Qu.:1988 nodeploy:143 pass :289
## Median :41.00 Median :1992 unavail :669
## Mean :44.62 Mean :1991
## 3rd Qu.:61.00 3rd Qu.:1996
## Max. :97.00 Max. :2003
## deploy
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3119
## 3rd Qu.:1.0000
## Max. :1.0000
# or can use index subsetting in df directly
df5 <- df[df$dead == 'dead', c("dvcat", "airbag", "seatbelt", "frontal", "sex", "ageOFocc", "yearVeh", "abcat", "occRole", "deploy")]
summary(df5)
## dvcat airbag seatbelt frontal sex
## 1-9km/h: 3 airbag:511 belted:500 Min. :0.0000 f:464
## 10-24 :114 none :669 none :680 1st Qu.:0.0000 m:716
## 25-39 :304 Median :0.0000
## 40-54 :344 Mean :0.4932
## 55+ :415 3rd Qu.:1.0000
## Max. :1.0000
## ageOFocc yearVeh abcat occRole
## Min. :16.00 Min. :1963 deploy :368 driver:891
## 1st Qu.:25.00 1st Qu.:1988 nodeploy:143 pass :289
## Median :41.00 Median :1992 unavail :669
## Mean :44.62 Mean :1991
## 3rd Qu.:61.00 3rd Qu.:1996
## Max. :97.00 Max. :2003
## deploy
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3119
## 3rd Qu.:1.0000
## Max. :1.0000
# check that df3, df4 and df5 are identical
identical(df3, df4)
## [1] TRUE
identical(df4, df5)
## [1] TRUE
Create new column names for the new data frame.
names(df3) <- c("Speed", "Airbag", "Seatbelt", "Front", "Gender", "Age", "YrVeh", "ABStatus", "Role", "ABDeploy")
df3
## # A tibble: 1,180 x 10
## Speed Airbag Seatbelt Front Gender Age YrVeh ABStatus Role ABDeploy
## <fct> <fct> <fct> <int> <fct> <int> <int> <fct> <fct> <int>
## 1 55+ none none 1 m 32 1987 unavail driver 0
## 2 25-39 airbag belted 0 f 54 1994 nodeploy driver 0
## 3 55+ none belted 1 m 67 1992 unavail driver 0
## 4 55+ none belted 1 f 64 1992 unavail pass 0
## 5 55+ none none 1 m 23 1986 unavail driver 0
## 6 25-39 none belted 0 f 77 1992 unavail pass 0
## 7 55+ none none 0 m 34 1990 unavail driver 0
## 8 10-24 airbag belted 1 m 37 1995 deploy driver 1
## 9 40-54 none none 0 f 33 1987 unavail driver 0
## 10 25-39 airbag none 1 f 84 1996 deploy driver 1
## # ... with 1,170 more rows
Use the summary function to create an overview of your new data frame. The print the mean and median for the same two attributes. Please compare.
summary(df3)
## Speed Airbag Seatbelt Front Gender
## 1-9km/h: 3 airbag:511 belted:500 Min. :0.0000 f:464
## 10-24 :114 none :669 none :680 1st Qu.:0.0000 m:716
## 25-39 :304 Median :0.0000
## 40-54 :344 Mean :0.4932
## 55+ :415 3rd Qu.:1.0000
## Max. :1.0000
## Age YrVeh ABStatus Role
## Min. :16.00 Min. :1963 deploy :368 driver:891
## 1st Qu.:25.00 1st Qu.:1988 nodeploy:143 pass :289
## Median :41.00 Median :1992 unavail :669
## Mean :44.62 Mean :1991
## 3rd Qu.:61.00 3rd Qu.:1996
## Max. :97.00 Max. :2003
## ABDeploy
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3119
## 3rd Qu.:1.0000
## Max. :1.0000
summarize(df3, AvgAge=mean(Age), MedAge=median(Age),AvgYrVeh=mean(YrVeh, na.rm=TRUE), MedYrVeh=median(YrVeh, na.rm=TRUE))
## # A tibble: 1 x 4
## AvgAge MedAge AvgYrVeh MedYrVeh
## <dbl> <dbl> <dbl> <dbl>
## 1 44.6 41 1991. 1992
The table below compares the occupant age and vehicle year statistics between the overall dataset (from Question 1) and the filtered dataset (this question) that only contains accidents that resulted in fatalities.
| Dataset | AGE Mean | AGE Median | YR_VEH Mean | YR_VEH Median |
|---|---|---|---|---|
| Overall | 37.2 | 33.0 | 1993 | 1994 |
| Fatalities | 44.6 | 41.0 | 1991 | 1992 |
The older age and earlier vehicle year for fatalities suggests that older drivers and older cars are correlated with more severe accidents. This may arise, for instance, because older cars (which typically are associated with older drivers) generally have fewer safety features. For instance airbags became mandatory in the US in 1998. Also older drivers may have slower reflexes to avoid accidents, etc.
For at least 3 values in a column please rename so that every value in that column is renamed. For example, suppose I have 20 values of the letter “e” in one column. Rename those values so that all 20 would show as “excellent”.
# change labels in the Speed column
# first see what the variable type is for the Speed column
str(df3$Speed)
## Factor w/ 5 levels "1-9km/h","10-24",..: 5 3 5 5 5 3 5 2 4 3 ...
levels(df3$Speed)
## [1] "1-9km/h" "10-24" "25-39" "40-54" "55+"
# change variable type to character so we can edit the values
df3$Speed <- as.character(df3$Speed)
str(df3$Speed)
## chr [1:1180] "55+" "25-39" "55+" "55+" "55+" "25-39" ...
# change Speed labels
df3$Speed[ df3$Speed == '1-9km/h' ] <- "Under10"
df3$Speed[ df3$Speed == '10-24' ] <- "Slow"
df3$Speed[ df3$Speed == '25-39' ] <- "Med"
df3$Speed[ df3$Speed == '40-54' ] <- "Fast"
df3$Speed[ df3$Speed == '55+' ] <- "Over55"
df3
## # A tibble: 1,180 x 10
## Speed Airbag Seatbelt Front Gender Age YrVeh ABStatus Role ABDeploy
## <chr> <fct> <fct> <int> <fct> <int> <int> <fct> <fct> <int>
## 1 Over55 none none 1 m 32 1987 unavail driv~ 0
## 2 Med airbag belted 0 f 54 1994 nodeploy driv~ 0
## 3 Over55 none belted 1 m 67 1992 unavail driv~ 0
## 4 Over55 none belted 1 f 64 1992 unavail pass 0
## 5 Over55 none none 1 m 23 1986 unavail driv~ 0
## 6 Med none belted 0 f 77 1992 unavail pass 0
## 7 Over55 none none 0 m 34 1990 unavail driv~ 0
## 8 Slow airbag belted 1 m 37 1995 deploy driv~ 1
## 9 Fast none none 0 f 33 1987 unavail driv~ 0
## 10 Med airbag none 1 f 84 1996 deploy driv~ 1
## # ... with 1,170 more rows
summary(df3)
## Speed Airbag Seatbelt Front Gender
## Length:1180 airbag:511 belted:500 Min. :0.0000 f:464
## Class :character none :669 none :680 1st Qu.:0.0000 m:716
## Mode :character Median :0.0000
## Mean :0.4932
## 3rd Qu.:1.0000
## Max. :1.0000
## Age YrVeh ABStatus Role
## Min. :16.00 Min. :1963 deploy :368 driver:891
## 1st Qu.:25.00 1st Qu.:1988 nodeploy:143 pass :289
## Median :41.00 Median :1992 unavail :669
## Mean :44.62 Mean :1991
## 3rd Qu.:61.00 3rd Qu.:1996
## Max. :97.00 Max. :2003
## ABDeploy
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3119
## 3rd Qu.:1.0000
## Max. :1.0000
Display enough rows to see examples of all of steps 1-5 above.
This is provided in the tibble outputs in the answers above.
BONUS - place the original .csv in a github file and have R read from the link. This will be a very useful skill as you progress in your data science education and career.
This is provided in the setup and data loading section above, where the file location can be set to either the C: drive or to my GitHub repository (file paths are copied below). This assignment was run using the GitHub file.
# load dataset from either of 2 file locations: C drive or GitHub
# location <- "C:/Users/Kevin/Documents/MSDS/Week2 Assignments/NASScds.csv"
# location <- "https://raw.githubusercontent.com/kecbenson/Bridge_Program/master/nassCDS.csv"