This project uses the CARS data set, which describes various aspects of commonly manufactured and sold vehicles, including their place of origin, manufacturer, mileage, and component specifications. The objective of the project is to apply the concepts of statistical inference to this data set. Several methods (the empirical CDF, the parametric and nonparametric bootstrap, maximum likelihood estimation, and Bayesian inference) were used to infer population quantities from the sample.
The dataset has 15 variables, of which 6 are categorical and 9 are numerical, and it contains 428 observations. URL: https://www.kaggle.com/ljanjughazyan/cars1#CARS.csv
library(readr)
library(Ecdat)
# Load the CARS data from a local CSV file with readr's read_csv().
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running this script on another machine.
CARS <- read_csv("C:/Users/Swagatam/Downloads/CARS.csv")
# Put the columns of CARS on the search path so that code further down can
# refer to them by bare name (e.g. Length, MPG_City, Horsepower).
attach(CARS)
# Empirical CDF of Length with a 95% confidence band ----
# The band uses the Dvoretzky-Kiefer-Wolfowitz half-width
#   Eps = sqrt(log(2 / Alpha) / (2 * n)),
# clipped to [0, 1] because a CDF cannot leave that interval.

# Empirical CDF; the column is read from CARS directly so this section does
# not depend on attach(). ecdf() drops missing values on its own.
len.ecdf <- ecdf(CARS$Length)
plot(len.ecdf, col = "blue", main = "Empirical CDF of Length")

# Confidence band for the ECDF
Alpha <- 0.05
# Count only the non-missing observations, i.e. the ones ecdf() actually used;
# counting NAs as well would make the band too narrow.
n <- sum(!is.na(CARS$Length))
Eps <- sqrt(log(2 / Alpha) / (2 * n))

# Evaluate the band on a grid derived from the data (slightly widened) instead
# of a hard-coded interval, so it still covers the plot if the data change.
len.range <- extendrange(CARS$Length, f = 0.05)
grid <- seq(len.range[1], len.range[2], length.out = 1000)
lines(grid, pmin(len.ecdf(grid) + Eps, 1), col = "red")
lines(grid, pmax(len.ecdf(grid) - Eps, 0), col = "red")
The red curves show the 95% confidence band around the empirical CDF of Length.
# Nonparametric bootstrap for the correlation of MPG_City and Length ----
# Rows of the two-column data are resampled with replacement 3000 times and
# the correlation is recomputed on every resample. No seed is set in this
# section, so the recorded outputs below come from one particular run and
# will differ slightly when the code is re-run.

# Point estimate on the observed data (columns read from CARS directly so the
# section does not depend on attach()).
cor.sample <- cor(CARS$MPG_City, CARS$Length)
CarsBootSample <- CARS[, c("MPG_City", "Length")]
N <- nrow(CarsBootSample)

# Bootstrap replicates of the correlation.
cor.boot <- replicate(3000, {
  rows <- sample.int(N, size = N, replace = TRUE)
  cor(CarsBootSample$MPG_City[rows], CarsBootSample$Length[rows])
})

# Bootstrap standard error of the sample correlation.
sd.cor.boot <- sd(cor.boot)
sd.cor.boot
## [1] 0.03271003
cor.sample
## [1] -0.5015264
hist(cor.boot, col = "pink",
     main = "Histogram of Non Parametric Bootstrap samples")

# Normal-approximation interval: estimate +/- 2 bootstrap standard errors.
normal.ci <- cor.sample + c(-2, 2) * sd.cor.boot
normal.ci
## [1] -0.5669465 -0.4361064

# Percentile (quantile) interval; computed here because the pivotal interval
# is built from the same two quantiles. It is printed further down.
quantile.ci <- quantile(cor.boot, c(0.025, 0.975))

# Pivotal interval: (2 * estimate - q_0.975, 2 * estimate - q_0.025).
# The bounds are labelled explicitly; inheriting the quantile names would tag
# the lower bound "97.5%" and the upper bound "2.5%", which is misleading.
pivotal.ci <- c(lower = 2 * cor.sample - quantile.ci[[2]],
                upper = 2 * cor.sample - quantile.ci[[1]])
pivotal.ci
## lower upper
## -0.5641556 -0.4349655
quantile.ci
## 2.5% 97.5%
## -0.5680873 -0.4388972
# Parametric bootstrap: difference in mean Length, Ford vs Chevrolet ----
# Each make's Length is modelled as normal with that make's sample mean and
# standard deviation; the difference in means is then re-estimated on
# simulated samples of the original sizes.
Ford <- with(CARS, Length[Make == "Ford"])
Chevrolet <- with(CARS, Length[Make == "Chevrolet"])

# Sample size, mean and standard deviation for Ford.
n.Ford <- length(Ford)
n.Ford
## [1] 23
mu.Ford <- mean(Ford)
sigma.Ford <- sd(Ford)

# Sample size, mean and standard deviation for Chevrolet.
n.Chevrolet <- length(Chevrolet)
n.Chevrolet
## [1] 27
mu.Chevrolet <- mean(Chevrolet)
sigma.Chevrolet <- sd(Chevrolet)

# Observed difference in means.
mu_hat <- mu.Ford - mu.Chevrolet
mu_hat
## [1] 0.2834138

# Analytic standard error of the difference, for comparison with the
# bootstrap standard error below.
sigma_hat <- sqrt(var(Ford) / n.Ford + var(Chevrolet) / n.Chevrolet)
sigma_hat
## [1] 5.098177

# Statistic of interest: difference between the means of two samples.
theta.hat <- function(s1, s2) {
  mean(s1) - mean(s2)
}

# 3200 parametric bootstrap replicates. The Ford sample is drawn before the
# Chevrolet sample in every replicate.
boot.theta.hat <- replicate(3200, {
  ford.draw <- rnorm(n.Ford, mean = mu.Ford, sd = sigma.Ford)
  chevrolet.draw <- rnorm(n.Chevrolet, mean = mu.Chevrolet,
                          sd = sigma.Chevrolet)
  theta.hat(ford.draw, chevrolet.draw)
})

# Bootstrap standard error and an interval of +/- 2 standard errors.
se <- sd(boot.theta.hat)
hist(boot.theta.hat, col = "orange",
     main = "Histogram of Parametric Bootstrap samples")
CI <- mu_hat + c(-2, 2) * se
se
## [1] 5.044365
CI
## [1] -9.805316 10.372143

# Two-sided z-test of "no difference in mean Length" using the bootstrap
# standard error.
z.stat <- mu_hat / se
p.value <- 2 * (1 - pnorm(abs(z.stat)))
z.stat
## [1] 0.05618425
p.value
## [1] 0.955195
# Bayesian inference for the mean Horsepower ----
# Normal model with the sample standard deviation plugged in as the known
# sd, and a conjugate normal prior N(mu0, sd0^2) on the mean. In terms of
# precisions (inverse variances):
#   prior precision  Ib = 1 / sd0^2
#   data precision   Ix = n / sd^2
#   posterior mean   = (mu0 * Ib + xbar * Ix) / (Ib + Ix)
#   posterior sd     = sqrt(1 / (Ib + Ix))
# Horsepower is read from CARS directly so the section does not depend on
# attach().
mu.hp <- mean(CARS$Horsepower)
sd.hp <- sd(CARS$Horsepower)

# Precision contributed by the data.
Ix <- length(CARS$Horsepower) / sd.hp^2

# Prior 1: tight prior centred at 200.
mu.prior1 <- 200
sd.prior1 <- 1
# Precision is the inverse of the variance, not of the standard deviation.
Ib1 <- 1 / sd.prior1^2
mu.posterior1 <- (mu.prior1 * Ib1 + mu.hp * Ix) / (Ib1 + Ix)
# 1 / (Ib1 + Ix) is the posterior variance; take the square root because the
# value is used as the sd argument of rnorm() below.
sd.posterior1 <- sqrt(1 / (Ib1 + Ix))
mu.prior1
## [1] 200

# Prior 2: very diffuse prior centred at 250.
mu.prior2 <- 250
sd.prior2 <- 1500
Ib2 <- 1 / sd.prior2^2
mu.posterior2 <- (mu.prior2 * Ib2 + mu.hp * Ix) / (Ib2 + Ix)
sd.posterior2 <- sqrt(1 / (Ib2 + Ix))
mu.prior2
## [1] 250

# Draw 100 values from each posterior and overlay their histograms.
posterior1 <- rnorm(100, mean = mu.posterior1, sd = sd.posterior1)
posterior2 <- rnorm(100, mean = mu.posterior2, sd = sd.posterior2)
col.posterior1 <- rgb(1, 0, 0, 0.5)
col.posterior2 <- rgb(0, 0, 1, 0.5)
hist(posterior1, col = col.posterior1, xlim = c(180, 250), ylim = c(0, 35),
     main = "Overlapping Histogram for Posterior HP", xlab = "Posterior HP")
hist(posterior2, col = col.posterior2, add = TRUE)
# hist() has no legend argument, so the legend is drawn explicitly.
legend("topright",
       legend = c("Prior 1: mean 200, sd 1", "Prior 2: mean 250, sd 1500"),
       fill = c(col.posterior1, col.posterior2))
box()