Assignment 3

An investigation of the correlation between Population and Road Accidents in India

Nishant Dudhwala (S3752868)

Last updated: 24 October, 2019

Introduction

Problem Statement

\[ H_0: \text{The data do not fit a linear regression model} \] \[ H_A: \text{The data fit a linear regression model} \]

Data

Descriptive Statistics and Visualisation

# Summary statistics for Population_of_India: minimum, quartiles, median,
# maximum, mean, standard deviation and row count, rendered as a table.
t1 <- Road_accident %>%
  summarise(
    Min = min(Population_of_India, na.rm = TRUE),
    Q1 = quantile(Population_of_India, probs = 0.25, na.rm = TRUE),
    Median = median(Population_of_India, na.rm = TRUE),
    Q3 = quantile(Population_of_India, probs = 0.75, na.rm = TRUE),
    Max = max(Population_of_India, na.rm = TRUE),
    Mean = mean(Population_of_India, na.rm = TRUE),
    SD = sd(Population_of_India, na.rm = TRUE),
    n = n()
  )
knitr::kable(t1)
Min Q1 Median Q3 Max Mean SD n
1014825 1083268 1152774 1220234 1283601 1152031 85878.86 18

Descriptive Statistics Cont.

# Outlier screening for Population_of_India using the 1.5 * IQR fences.
#   Lobound / Upbound     : lower and upper fences (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
#   NumLoOuts / NumUpOuts : number of observations outside each fence
#   SD_Min2Mean           : distance from the minimum to the mean, in SDs
#   SD_Mean2Max           : distance from the mean to the maximum, in SDs
# Fix: SD_Mean2Max previously used `Road_accident & Population_of_India`
# (`&` typed instead of `$`), which made the whole column NA.
t2 <- Road_accident %>%
  summarise(
    Lobound = quantile(Population_of_India, probs = 0.25, na.rm = TRUE) -
      1.5 * IQR(Population_of_India, na.rm = TRUE),
    # Lobound here is the fence computed on the line above.
    NumLoOuts = sum(Population_of_India < Lobound, na.rm = TRUE),
    SD_Min2Mean = (mean(Population_of_India, na.rm = TRUE) -
      min(Population_of_India, na.rm = TRUE)) /
      sd(Population_of_India, na.rm = TRUE),
    Upbound = quantile(Population_of_India, probs = 0.75, na.rm = TRUE) +
      1.5 * IQR(Population_of_India, na.rm = TRUE),
    NumUpOuts = sum(Population_of_India > Upbound, na.rm = TRUE),
    Mean = mean(Population_of_India, na.rm = TRUE),
    SD_Mean2Max = (max(Population_of_India, na.rm = TRUE) -
      mean(Population_of_India, na.rm = TRUE)) /
      sd(Population_of_India, na.rm = TRUE),
    n = n()
  )
knitr::kable(t2)
Lobound NumLoOuts SD_Min2Mean Upbound NumUpOuts Mean SD_Mean2Max n
877819.5 0 1.597673 1425683 0 1152031 NA 18

Descriptive Statistics Cont.

# Summary statistics for Total_Number_of_Road_Accident: minimum, quartiles,
# median, maximum, mean, standard deviation and row count, as a table.
t3 <- Road_accident %>%
  summarise(
    Min = min(Total_Number_of_Road_Accident, na.rm = TRUE),
    Q1 = quantile(Total_Number_of_Road_Accident, probs = 0.25, na.rm = TRUE),
    Median = median(Total_Number_of_Road_Accident, na.rm = TRUE),
    Q3 = quantile(Total_Number_of_Road_Accident, probs = 0.75, na.rm = TRUE),
    Max = max(Total_Number_of_Road_Accident, na.rm = TRUE),
    Mean = mean(Total_Number_of_Road_Accident, na.rm = TRUE),
    SD = sd(Total_Number_of_Road_Accident, na.rm = TRUE),
    n = n()
  )
knitr::kable(t3)
Min Q1 Median Q3 Max Mean SD n
391449 432246.2 479934 488669 501423 461236.4 37479.06 18

Descriptive Statistics Cont.

# Outlier screening for Total_Number_of_Road_Accident using the 1.5 * IQR
# fences.
#   Lobound / Upbound     : lower and upper fences (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
#   NumLoOuts / NumUpOuts : number of observations outside each fence
#   SD_Min2Mean           : distance from the minimum to the mean, in SDs
#   SD_Mean2Max           : distance from the mean to the maximum, in SDs
# Fix: SD_Mean2Max previously used
# `Road_accident & Total_Number_of_Road_Accident` (`&` typed instead of `$`),
# which made the whole column NA.
t4 <- Road_accident %>%
  summarise(
    Lobound = quantile(
      Total_Number_of_Road_Accident,
      probs = 0.25, na.rm = TRUE
    ) - 1.5 * IQR(Total_Number_of_Road_Accident, na.rm = TRUE),
    # Lobound here is the fence computed just above.
    NumLoOuts = sum(Total_Number_of_Road_Accident < Lobound, na.rm = TRUE),
    SD_Min2Mean = (mean(Total_Number_of_Road_Accident, na.rm = TRUE) -
      min(Total_Number_of_Road_Accident, na.rm = TRUE)) /
      sd(Total_Number_of_Road_Accident, na.rm = TRUE),
    Upbound = quantile(
      Total_Number_of_Road_Accident,
      probs = 0.75, na.rm = TRUE
    ) + 1.5 * IQR(Total_Number_of_Road_Accident, na.rm = TRUE),
    NumUpOuts = sum(Total_Number_of_Road_Accident > Upbound, na.rm = TRUE),
    Mean = mean(Total_Number_of_Road_Accident, na.rm = TRUE),
    SD_Mean2Max = (max(Total_Number_of_Road_Accident, na.rm = TRUE) -
      mean(Total_Number_of_Road_Accident, na.rm = TRUE)) /
      sd(Total_Number_of_Road_Accident, na.rm = TRUE),
    n = n()
  )
knitr::kable(t4)
Lobound NumLoOuts SD_Min2Mean Upbound NumUpOuts Mean SD_Mean2Max n
347612.1 0 1.862038 573303.1 0 461236.4 NA 18

Descriptive Statistics Cont.

# Side-by-side boxplots of the two series.
# Population is divided by 2.7, presumably so that both boxes sit on a
# comparable vertical scale -- confirm the choice of factor.
# The group labels are passed through `names =`: boxplot() already draws
# default labels ("1", "2") for unnamed inputs, so adding them afterwards
# with axis() printed the text on top of those defaults.
# NOTE(review): the y-axis label mentions only population although the axis
# carries scaled population and accident counts -- confirm the wording.
boxplot(
  Road_accident$Population_of_India / 2.7,
  Road_accident$Total_Number_of_Road_Accident,
  names = c("Population_of_India", "Total_Number_of_Road_Accident"),
  ylab = "Population of India",
  xlab = "Population of India and Total_Number_of_Road_Accident"
)

Descriptive Statistics Cont.

# Two-sided paired t-test on the yearly population and accident counts at
# the 95% confidence level.
# `alternative` is spelled out in full ("two.sided") instead of relying on
# partial matching of the truncated value "two.side".
t.test(
  Road_accident$Population_of_India,
  Road_accident$Total_Number_of_Road_Accident,
  paired = TRUE,
  alternative = "two.sided",
  conf.level = 0.95
)
## 
##  Paired t-test
## 
## data:  Road_accident$Population_of_India and Road_accident$Total_Number_of_Road_Accident
## t = 50.514, df = 17, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  661942.3 719647.4
## sample estimates:
## mean of the differences 
##                690794.9

Descriptive Statistics Cont. (Linear Regression Model)

# Hand-computed simple linear regression of Population_of_India (response,
# suffix B) on Total_Number_of_Road_Accident (predictor, suffix A), i.e. the
# same orientation as lm(Population_of_India ~ Total_Number_of_Road_Accident).
#
# Fixes relative to the previous version:
#   * sum_A_sq / sum_B_sq are sums of squares, sum(x^2), not squared sums,
#     sum(x)^2 (which made LAA and LBB meaningless);
#   * the intercept is mean(response) - b * mean(predictor), matching the
#     slope b = LAB / LAA;
#   * the scatter plot and the reference lm() line use the same orientation
#     as the hand-computed coefficients, so both lines can be compared;
#   * values are converted to double first so that products of large counts
#     cannot overflow integer arithmetic.
acc <- as.numeric(Road_accident$Total_Number_of_Road_Accident)
pop <- as.numeric(Road_accident$Population_of_India)

A <- acc^2
B <- pop^2
AB <- pop * acc

sum_A <- sum(acc)
sum_B <- sum(pop)
sum_A_sq <- sum(A)
sum_B_sq <- sum(B)
sum_AB <- sum(AB)
n <- length(acc)

# Corrected sums of squares and cross-products.
LAA <- sum_A_sq - sum_A^2 / n
LBB <- sum_B_sq - sum_B^2 / n
LAB <- sum_AB - sum_A * sum_B / n

# Least-squares slope and intercept.
b <- LAB / LAA
a <- mean(pop) - b * mean(acc)

plot(Population_of_India ~ Total_Number_of_Road_Accident, data = Road_accident)
# Blue: hand-computed line; default colour: line fitted by lm().
abline(a = a, b = b, col = "Blue")
abline(
  lm(Population_of_India ~ Total_Number_of_Road_Accident, data = Road_accident)
)

# Fit the linear model of population on the number of road accidents and
# print its summary.
model1 <- lm(
  Population_of_India ~ Total_Number_of_Road_Accident,
  data = Road_accident
)
summary(model1)
## 
## Call:
## lm(formula = Population_of_India ~ Total_Number_of_Road_Accident, 
##     data = Road_accident)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -58156 -33497  -7503  21035 124491 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   2.632e+05  1.434e+05   1.836   0.0851 .  
## Total_Number_of_Road_Accident 1.927e+00  3.100e-01   6.217 1.23e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47900 on 16 degrees of freedom
## Multiple R-squared:  0.7072, Adjusted R-squared:  0.6889 
## F-statistic: 38.65 on 1 and 16 DF,  p-value: 1.233e-05
# R2 from the hand-computed quantities: slope times the cross-product sum,
# divided by LBB.
R2 <- b * LAB / LBB
print(R2)
## [1] 8.009119e-08

Hypothesis Testing

# Upper-tail p-value of the overall F-test. The statistic and its degrees of
# freedom are read from the fitted model instead of being typed in from the
# rounded printed summary, so the result stays in sync with the model.
f_stat <- summary(model1)$fstatistic
pf(
  f_stat[["value"]],
  df1 = f_stat[["numdf"]],
  df2 = f_stat[["dendf"]],
  lower.tail = FALSE
)
# Analysis-of-variance table for the same model.
anova(model1)

Since the p-value is below the 0.05 significance level, we reject \(H_0\), which means the data fit a linear regression model.

Hypothesis Testing Cont.

# Coefficient table (estimate, standard error, t value, p-value) of model1.
coef(summary(model1))
##                                   Estimate   Std. Error  t value
## (Intercept)                   2.632418e+05 1.434104e+05 1.835584
## Total_Number_of_Road_Accident 1.926972e+00 3.099610e-01 6.216819
##                                   Pr(>|t|)
## (Intercept)                   8.507555e-02
## Total_Number_of_Road_Accident 1.233415e-05

\[H_0: \beta = 0 \] \[H_A: \beta \ne 0 \]

This hypothesis test gives the following result: \[ t = 6.216819 \] \[ p < .001 \] The slope is statistically significant at the 0.05 level. This means there is statistically significant evidence that the slope is not 0.

Hypothesis Testing Cont.

# Draw the standard diagnostic plots for the fitted linear model model1.
plot(model1)

Discussion

References