Week 3 Assignment

Input CSV

library(bitops)
library(plyr)
library(RCurl)

# Fetch the CSV text over HTTPS, parse it into a data frame, then print it.
# Despite its name, `file_URL` ends up holding the downloaded file contents.
# NOTE(review): the warning below, together with the single blank/NA entry in
# every column of the summary, suggests an unbalanced quote or a cut-off last
# line in the CSV -- check the tail of the file.
csv_address <- "https://raw.githubusercontent.com/jey1987/Week3_Assignment/master/fars_new.csv"
file_URL <- getURL(url = csv_address)
input_dataset <- read.csv(text = file_URL)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : EOF within quoted string
input_dataset

Dataset Summary

# Per-column overview: quartiles and mean for numeric columns, counts per
# value for the categorical ones, plus the number of NA entries.
summary(input_dataset)
##        X            caseid          state            age        
##  Min.   :1998   1:101:1:    2   Min.   : 1.00   Min.   :  0.00  
##  1st Qu.:1998   1:125:1:    2   1st Qu.:12.00   1st Qu.: 18.00  
##  Median :1999   1:129:1:    2   Median :25.00   Median : 27.00  
##  Mean   :1999   1:201:1:    2   Mean   :25.07   Mean   : 53.39  
##  3rd Qu.:1999   1:205:1:    2   3rd Qu.:37.00   3rd Qu.: 51.00  
##  Max.   :2000   1:219:1:    2   Max.   :56.00   Max.   :999.00  
##                 (Other):23406   NA's   :1       NA's   :1       
##      airbag          injury        restraint           sex       
##  Min.   : 1.00   Min.   :0.000   Min.   : 0.000   Min.   :1.000  
##  1st Qu.:29.00   1st Qu.:2.000   1st Qu.: 0.000   1st Qu.:1.000  
##  Median :30.00   Median :3.000   Median : 3.000   Median :2.000  
##  Mean   :35.37   Mean   :2.594   Mean   : 9.698   Mean   :1.572  
##  3rd Qu.:30.00   3rd Qu.:4.000   3rd Qu.: 3.000   3rd Qu.:2.000  
##  Max.   :99.00   Max.   :5.000   Max.   :99.000   Max.   :9.000  
##  NA's   :1       NA's   :1       NA's   :1        NA's   :1      
##     inimpact        modelyr      airbagAvail     airbagDeploy  
##  Min.   : 0.00   Min.   :1928          :    1          :    1  
##  1st Qu.: 3.00   1st Qu.:1988   NA-code: 3421   NA-code: 5054  
##  Median :11.00   Median :1992   no     :13600   no     :15071  
##  Mean   : 9.95   Mean   :2002   yes    : 6396   yes    : 3292  
##  3rd Qu.:12.00   3rd Qu.:1995                                  
##  Max.   :99.00   Max.   :9999                                  
##  NA's   :1       NA's   :1                                     
##    Restraint        D_injury     D_airbagAvail   D_airbagDeploy 
##         :    1   Min.   :0.000          :    1          :    1  
##  NA-code: 1837   1st Qu.:2.000   NA-code: 3150   NA-code: 5299  
##  no     : 8366   Median :3.000   no     :11452   no     :13511  
##  yes    :13214   Mean   :2.524   yes    : 8815   yes    : 4607  
##                  3rd Qu.:4.000                                  
##                  Max.   :5.000                                  
##                  NA's   :1                                      
##   D_Restraint         year     
##         :    1   Min.   :1998  
##  NA-code: 1879   1st Qu.:1998  
##  no     : 7960   Median :1998  
##  yes    :13578   Mean   :1998  
##                  3rd Qu.:1999  
##                  Max.   :1999  
##                  NA's   :1

Model Year Table Values

# Frequency of each model year; missing values are left out of the counts.
table(input_dataset$modelyr, useNA = "no")
## 
## 1928 1929 1930 1947 1948 1953 1955 1956 1957 1961 1962 1963 1964 1965 1966 
##    1    3    1    1    1    2    3    1    1    3    2    8   11   15   20 
## 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 
##   15   19   30   30   27   37   51   37   37   54   96  145  202  167  178 
## 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 
##  236  398  661  855 1103 1265 1503 1555 1438 1475 1440 1579 1618 1876 1463 
## 1997 1998 1999 2000 9999 
## 1509 1444  709   60   32

Mean

# Arithmetic mean of model year, skipping missing values.
# NOTE(review): the model-year table lists 32 rows with the value 9999, which
# looks like a placeholder for an unknown year and would pull this mean
# upward -- confirm, and exclude those rows if so.
mean(input_dataset$modelyr, na.rm = TRUE)
## [1] 2001.823

Median

# Middle value of model year, skipping missing values.
median(input_dataset$modelyr, na.rm = TRUE)
## [1] 1992

Standard Deviation

# Sample standard deviation of model year, skipping missing values.
# NOTE(review): the 32 rows with model year 9999 in the table look like an
# unknown-year placeholder and would dominate this spread -- confirm, and
# exclude those rows if so.
sd(input_dataset$modelyr, na.rm = TRUE)
## [1] 295.8905

Median Absolute Deviation

# Median absolute deviation of model year (with mad()'s default scaling
# constant), skipping missing values.
mad(input_dataset$modelyr, na.rm = TRUE)
## [1] 5.9304

Interquartile Range

# Distance between the first and third quartiles of model year, skipping
# missing values.
IQR(input_dataset$modelyr, na.rm = TRUE)
## [1] 7

Column Rename

# Work on the rows whose age is recorded as 999 (subset() also drops rows
# where age is NA).
data_frame_input <- subset(input_dataset, age == 999)
# Rename the sex column to Gender. Matching on the column name instead of
# position 8 keeps the rename correct if the column order of the CSV changes.
names(data_frame_input)[names(data_frame_input) == "sex"] <- "Gender"
data_frame_input

Column Revalue

# Recode Restraint in a single pass: yes -> "1", no -> "0", NA-code -> "999".
# The replacement values are text labels, not numbers.
data_frame_input$Restraint <- revalue(
  data_frame_input$Restraint,
  c("yes" = "1", "no" = "0", "NA-code" = "999")
)

data_frame_input

Derived Column

# Derive Eligible from inimpact: values above 5 become "Eligible", all other
# values "InEligible"; rows with a missing inimpact get NA.
data_frame_input$Eligible <- ifelse(
  data_frame_input$inimpact > 5,
  "Eligible",
  "InEligible"
)

data_frame_input

Top 10 Rows

# Show at most the first ten rows of the working data frame.
head(x = data_frame_input, n = 10L)

Plots

# library() stops with an error when ggplot2 is missing; require() would only
# return FALSE and let the script fail later at the first ggplot2 call.
library(ggplot2)
# Base-graphics box plot and histogram of the airbag column.
boxplot(data_frame_input$airbag)

hist(data_frame_input$airbag)

# The same histogram drawn with ggplot2. ggplot() + geom_histogram() replaces
# the deprecated qplot() shortcut and uses the same default binning.
ggplot(data_frame_input, aes(x = airbag)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# state plotted against inimpact; passing `data =` labels the axes with the
# plain column names.
plot(state ~ inimpact, data = data_frame_input)

Analytical Question

Question: How many injuries occurred in vehicles with a model year before 1980? Provide a summary of injuries by model year and plot the data accordingly.

# Keep only rows with a model year before 1980 (subset() also drops rows
# where modelyr is NA).
data_frame_avg <- subset(input_dataset, modelyr < 1980)
# Sum the injury column within each model year. The result has one row per
# model year, with columns Group.1 (model year) and x (the sum).
sum_agg <- aggregate(
  data_frame_avg$injury,
  by = list(data_frame_avg$modelyr),
  FUN = sum
)
# One model year per row of data_frame_avg; kept for later use, but not plot
# data because its length differs from the number of rows in sum_agg.
model_yr <- data_frame_avg$modelyr
# Plot the per-year sums against model year with explicit axis labels.
plot(
  sum_agg$Group.1,
  sum_agg$x,
  xlab = "Model year",
  ylab = "Sum of injury values"
)

Conclusion: Looking at the data, the number of injuries was under control until model year 1960 and then started increasing drastically, peaking just before 1980 (the analysis only includes model years before 1980). This suggests a need to look at model years 1960 to 1980 to find out what changes were made to the vehicles after 1960. Identifying this crucial information could reduce the number of injuries in the future.