Winequality_Red<-read.csv("C:/Users/Pallavi/Desktop/MSBA Aug22/Stat Methods/Wine_Project/winequality-red.csv")
mu_density<-mean(Winequality_Red$density,na.rm = TRUE)
#Q1(a) Provide an estimate of mu based on the sample;
mu_density
## [1] 0.9967467
#The mean(mu) = 0.9967467
# This is sample mean of 1599 red wines is an estimate mean of
# population mean of all the wines from purtugal.
#Q1(b) How to quantify the variability (uncertainty) of the estimate?
# Central Limit Theoram - N(mu,sigma sq/n)
# mu.hat ~ N(mu,sigma sq) where mu.hat = sample mean random variable,
# mu = Polpulation mean of unknown parameter
sd_density<-sd(Winequality_Red$density)
sd_density # 0.001887334
## [1] 0.001887334
# Since our wine sample belongs to same population of purugal
# then population mean x bar will follow same sample mean.
# and x bar ~ N(mu,sigma sq/n)
#Based on the sample estimate, the mu value is very close to one,
#and the variability is close to zero, so the Central Limit Theorem
#can be used as the distribution appears to be normal.
#We can also obersve the pattern of curve through the histogram below as
#it shows a normal distribution for the sample of density.
hist(Winequality_Red$density)

sd.mu.hat<-sd(Winequality_Red$density)/sqrt(length(Winequality_Red$density))
sd.mu.hat # 4.71981e-05
## [1] 4.71981e-05
# mu.hat ~ N(0.996(mu),sq(4.71981e-05) )
# Thus the estimate mu hat(random variable) will fall
# in the interval (mu- 0.0000032,mu+0.0000032) with 95% chance.
#Q1(c) Use the CLT to give a 95% confidence interval for mu
# Pr(mu.hat-0.0000032 <= mu <= mu.hat+0.0000032 )
# Calculating Lower CI
lower_CI<-mu_density-2*(sd.mu.hat)
lower_CI # 0.9966162
## [1] 0.9966523
# Calculating Upper CI
upper_CI<-mu_density+2*(sd.mu.hat)
upper_CI # 0.996805
## [1] 0.9968411
# In case of density the confidence interval will be (0.9966162,0.996805)
# Therefore we can say mu is in the interval (0.9966162,0.996805)
# with 95% chance. Assuming mu as a fixed coin and CI (0.9966162,0.996805)
# as a moving bucket, the moving bucket will cover this fixed coin with 95%
# probability.
#Q1(d)Use the bootstrap method to do parts b and c, and compare
# the results with those obtained from the CLT. State your findings.
mu_density.set <- NULL
for (k in 1:2000) {
density.bootstrap <- sample(Winequality_Red$density, size=1599, replace=T)
mu_density <- mean(density.bootstrap)
mu_density.set[k] <- mu_density
}
sd(mu_density.set) # SD from bootstrap 4.644378e-05
## [1] 4.682341e-05
# Comparing 2 estimates from CLT and bootstrap
# sd from CLT = 4.71981e-05 , sd from bootstrap = 4.644378e-05
# We can see that sd from bootstrap is almost similar to CLT.
CI_boostrap<-quantile(mu_density.set,probs= c(0.025,0.975))
CI_boostrap
## 2.5% 97.5%
## 0.9966560 0.9968373
# CI from bootstrap @ 2.5% and 97.5% is 0.9966585 and 0.9968417 respectively.
# CI from CLT @ 2.5% and 97.5% is 0.9967396 and 0.9967538 respectively
# We can see that CI from bootstrap is almost similar to CLT.
#Q1(e) Can we use a normal distribution to model “density”?
# If yes, what are the maximum likelihood estimates of the mean
# and standard deviation? Please provide their standard errors as well.
hist(Winequality_Red$density)

# From the histogram it is evident that density function
# is normally distributed and we can use normal distribution to model"density".
# So now let's calculate max.likelyhood estimates of the mean and SD.
# Since density data values are in decimals so we will multiply
# these values by 10 and then divide estimates and sd by 10 to
# get precise MLE results.
# multiplying density variable by 10 and getting new density
density_no2<-Winequality_Red$density*10
# Checking the histogram for new density variable
hist(density_no2)

# Calculating density function for the 1st observation
dnorm(density_no2[1],mean=mean(density_no2),sd=sd(density_no2))
## [1] 18.08945
#[1] 18.08945
sd_density_no2<-sd(density_no2)
#[1] 0.01887334
minuslog.lik<-function(mu, sigma){
log.lik<-0
for(i in 1:1599){
log.lik<-log.lik+log(dnorm(density_no2[i], mean=mu,
sd=sigma))
}
return(-log.lik)
}
minuslog.lik(80,20) #16062.54
## [1] 16062.54
minuslog.lik(100,30) # 14108.6
## [1] 14108.6
minuslog.lik(0,1) # 80900.31
## [1] 80900.31
est <- mle(minuslog=minuslog.lik,
start=list(mu=mean(density_no2),
sigma=sd(density_no2)))
summary(est)
## Maximum likelihood estimation
##
## Call:
## mle(minuslogl = minuslog.lik, start = list(mu = mean(density_no2),
## sigma = sd(density_no2)))
##
## Coefficients:
## Estimate Std. Error
## mu 9.96746679 0.0004719810
## sigma 0.01887334 0.0003296881
##
## -2 log L: -8159.31
# Maximum Likelyhood esimation
#Coefficients:
# Estimate Std. Error
# mu 9.96746679 0.0004719810
# sigma 0.01887334 0.0003296881
# -2 log L: -8159.31
# Now since we get final values so now need to divide
# estimates and sd by 10 to get precise MLE results.
#Coefficients new values after dividing by 10 can be seen below:
# Estimate Std. Error
# mu 0.9967 0.0001 # rounded to 4 decimal places
# sigma 0.0019 0.0000 # rounded to 4 decimal places
# So max.likelyhood estimation of the mean is 0.9967 and of
# the standard deviation is 0.0019. and std.error of mean is 0.0001
# and standard deviation is 0.0000.(All rounded to four decimal places)
#Q2. Suppose the population mean of the variable “residual sugar” is mu ,
# answer the following questions.
mu_residual_sugar<-mean(Winequality_Red$residual.sugar,na.rm = TRUE)
mu_residual_sugar<-mean(Winequality_Red$residual.sugar,na.rm=TRUE)
#Q2(a) Provide an estimate of mu based on the sample;
mu_residual_sugar
## [1] 2.538806
#The mean(mu) = 2.538806
#Q2(b)Noting that the sample distribution of “residual sugar”
# is highly skewed, can we use the CLT to quantify the variability
# of your estimate? Can we use the CLT to give a 95% confidence
# interval for mu? If yes, please give your solution. If no, explain why.
hist(Winequality_Red$residual.sugar)

summary(Winequality_Red$residual.sugar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.900 1.900 2.200 2.539 2.600 15.500
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.900 1.900 2.200 2.539 2.600 15.500
# From the histogram it is clear that distribution of residual
# sugar is highly skewed towards right and from summary function it
# is clear the distence bw 3rd qu and max difference is 12.9.
# Now the Central limit theoram says mu.hat = N(mu,sq sd)
# "The estimate mu.hat,as a random variable is unbiased i.e.E(mu.hat)=mu"
# "The estimate mu.hat,as a random variable is distributed
# symetrically around unknown parameter(mu)"
# "And estimate mu.hat,as a random variable, will fall in the interval
# (mu-sd.hat,mu+sd.hat) with 95% chance. and parameter mu will
# follow the normal distribution N(mu,sd.mu.hat) as
# per Central Limit Theoram."
# So as per above theoram although residual sugar data is highly
# skewed,but means of random variable will be noramlly distributed
# since sample size is big (1599) That's why we can use the
# CLT to quantify the variability of our estimate
#Using the CLT to give a 95% confidence interval for mu
sd_residual_sugar<-sd(Winequality_Red$residual.sugar)/sqrt(length(Winequality_Red$residual.sugar))
sd_residual_sugar # 0.03525922
## [1] 0.03525922
# Calculating Lower CI
lower_CI_residual_sugar<-mu_residual_sugar-2*(sd_residual_sugar)
lower_CI_residual_sugar # 2.468287
## [1] 2.468287
# Calculating Upper CI
upper_CI_residual_sugar<-mu_residual_sugar+2*(sd_residual_sugar)
upper_CI_residual_sugar # 2.609324
## [1] 2.609324
# As evident from above CI, we are able to use the CLT method to
# give a 95% confidence interval for the sample of mu. The solution is: (2.4683, 2.6093).
# In case of density the confidence interval will be (0.9966162,0.996805)
#Q2(c)Use the bootstrap method to do part b. Is the bootstrap confidence interval
# symmetric? (hint: check the bootstrap distribution; see p. 43 in Lecture 3).
mu_residual_sugar.set <- NULL
for (k in 1:2000) {
residual_sugar.bootstrap <- sample(Winequality_Red$residual.sugar, size=1599, replace=T)
mu_residual_sugar <- mean(residual_sugar.bootstrap)
mu_residual_sugar.set[k] <- mu_residual_sugar
}
sd(mu_residual_sugar.set) # SD from bootstrap 0.03519252
## [1] 0.03493251
# Comparing 2 estimates from CLT and bootstrap
# sd from CLT = 0.03525922 , sd from bootstrap = 0.03519252
# We can see that sd(variablity) from bootstrap is almost similar to CLT.
CI_boostrap<-quantile(mu_residual_sugar.set,probs= c(0.025,0.975))
CI_boostrap
## 2.5% 97.5%
## 2.473698 2.610956
# CI from bootstrap @ 2.5% and 97.5% is 2.471883 and 2.608983 respectively.
# CI from CLT @ 2.5% and 97.5% is 2.468287 and 2.609324 respectively
# CI from bootstrap is almost similar to CLT
# Checking CI of bootstrap is symmetric or not through histograme.
hist(mu_residual_sugar.set,lwd=5,freq = FALSE)
lines(density(mu_residual_sugar.set),lwd=5,col='red')

# We can see from above histogram obtained from bootstrap that the
# density curve is most likely symmetric and normally distributed.
#Q2(d)Can we use a normal distribution to model “residual sugar”? If no, what
# distribution do you think can approximate its empirical distribution? What
# parameters are needed to characterize such a distribution? what are their
# maximum likelihood estimates? Please provide their standard errors as well.
# No we can not use normal distribution to model " residual sugar" because
# as evident from histogram of origional data of residual sugar, it is highly
# right side skewed. Having said that a log-normal can approximate its empirical
# distribution because A log-normal distribution is a continuous distribution of
# random variable whose natural logarithm is normally distributed.
#The Log Normal Distribution is described as, “Density, distribution function,
# quantile function and random generation for the log normal distribution whose
# logarithm has mean equal to meanlog and standard deviation equal to sdlog.”
#
# The parameters mu(mean) and sigma(sd) are two parameters of the logarithm.
# dlnorm -> gives the density of the function,
# plnorm -> gives the distribution function,
# qlnorm -> gives the quantile function,
# rlnorm -> generates random deviates.
# So now let's calculate max.likelyhood estimates of the mean and SD.
# Calculating density function for the 1st observation
residual_sugar<-Winequality_Red$residual.sugar
dnorm(residual_sugar[1],mean=mean(residual_sugar),sd=sd(residual_sugar))
## [1] 0.2553509
#[1] 0.2553509
sd_residual_sugar<-sd(residual_sugar)
sd_residual_sugar
## [1] 1.409928
#[1] 1.409928
minuslog.lik<-function(mu, sigma){
log.lik<-0
for(i in 1:1599){
log.lik<-log.lik+log(dlnorm(residual_sugar[i], meanlog=mu,
sdlog=sigma))
}
return(-log.lik)
}
minuslog.lik(80,20) #20140.88
## [1] 20140.88
minuslog.lik(100,30) #17000.45
## [1] 17000.45
minuslog.lik(0,1) #3508.942
## [1] 3508.942
est <- mle(minuslog=minuslog.lik,
start=list(mu=log(mean(residual_sugar)),
sigma=log(sd(residual_sugar))))
summary(est)
## Maximum likelihood estimation
##
## Call:
## mle(minuslogl = minuslog.lik, start = list(mu = log(mean(residual_sugar)),
## sigma = log(sd(residual_sugar))))
##
## Coefficients:
## Estimate Std. Error
## mu 0.8502341 0.008935942
## sigma 0.3573260 0.006318293
##
## -2 log L: 3965.773
# Maximum Likelyhood esimation
# Coefficients:
# Estimate Std. Error
# mu 0.8502341 0.008935942
# sigma 0.3573260 0.006318293
# -2 log L: 3965.773
# So max.likelyhood estimation of the mean is 0.8502 and of the standard
# deviation is 0.3573. and std.error of mean is 0.0089 and standar deviation
# is 0.0063.(All rounded to four decimal places)
#Q3. We classify those wines as “excellent” if their rating is at least 7.
# Suppose the population proportion of excellent wines is p. Do the following:
#Q3(a) Use the CLT to derive a 95% confidence interval for p;
# First,we will create a binary variable and categorize values as per
# either one or zero based on their quality;
# If the wine’s rating is at least 7 -> The value will be 1
# If the wine’s rating is less than 7 -> The value will be 0
# We will use The Bernoulli distribution in this case."The Bernoulli
# distribution models any data generating process that can be thought of
# as tossing a coin with a probability p of seeing the head. We say that
# a random variable X follows a Bernoulli distribution if:
# • It only has two possible outcomes 0 and 1 (no / yes); and
# • Its probabilities satisfy
# Pr(X = 1) = p = 1 - Pr(X = 0) = 1 - q"
# Creating binary variables
wine_excellent<- as.numeric(Winequality_Red$quality >= 7)
# Sample proportion
p_hat<-mean(wine_excellent)
p_hat #0.1357098
## [1] 0.1357098
# Standard deviation of sample proportion
sd_excellent<-sqrt(p_hat*(1-p_hat)/length(wine_excellent))
sd_excellent #0.008564681
## [1] 0.008564681
# Lower CI for p
lower_CI_p <- p_hat - 2*(sd_excellent)
lower_CI_p # 0.1185805
## [1] 0.1185805
upper_CI_p<-p_hat + 2*(sd_excellent)
upper_CI_p # 0.1528392
## [1] 0.1528392
# The lower CI for p is 0.1186 and upper CI for p is 0.1528 rounded
# to 4 decimal points at 95% confidence interval.
# As evident above, we are able to use the CLT to give a 95% confidence
# interval for p. The solution is: (0.1186, 0.1528).
#3(b) Use the bootstrap method to derive a 95% confidence interval for p
mu_p.set <- NULL
for (k in 1:2500) {
p.bootstrap <- sample(wine_excellent, size=1599, replace=T)
mu_p <- mean(p.bootstrap)
mu_p.set[k] <- mu_p
}
sd(mu_p.set) # SD from bootstrap 0.008717535
## [1] 0.008566244
# Comparing 2 estimates from CLT and bootstrap
# sd from CLT = 0.008564681 , sd from bootstrap = 0.008717535
# We can see that sd from bootstrap is almost similar to CLT.
CI_boostrap_p<-quantile(mu_p.set,probs= c(0.025,0.975))
CI_boostrap_p
## 2.5% 97.5%
## 0.1194497 0.1525954
# CI from bootstrap @ 2.5% and 97.5% is 0.1188243 and 0.1525954 respectively.
# Let's see the histograme of same
hist(mu_p.set,freq = FALSE)
lines(density(mu_p.set),lwd=5,col='red')

#Q3(c)Compare the two intervals. Is there any difference worth our attention?
# CI from bootstrap @ 2.5% and 97.5% is 0.1188243 and 0.1525954 respectively.
# CI from CLT @ 2.5% and 97.5% is 0.1185805 and 0.1528392 respectively
# We can see that CI from bootstrap is almost similar to CLT.and there is
# no difference worth our attention.
#Q3(d) What is the maximum likelihood estimate of p and its standard error?
hist(wine_excellent)

minuslog.lik <- function(p) {
log.lik <- 0
for(i in 1:1599) {
log.lik <- log.lik + log(dbinom(wine_excellent[i], size = 1, prob = p))
}
return(-log.lik)
}
est <- stats4::mle(minuslog = minuslog.lik,
start = list(p = p_hat))
summary(est)
## Maximum likelihood estimation
##
## Call:
## stats4::mle(minuslogl = minuslog.lik, start = list(p = p_hat))
##
## Coefficients:
## Estimate Std. Error
## p 0.1357098 0.008564278
##
## -2 log L: 1269.921
#Maximum likelihood estimation
#Call:
#stats4::mle(minuslogl = minuslog.lik, start = list(p = p_hat))
#Coefficients:
# Estimate Std. Error
# p 0.1357098 0.008564278
# -2 log L: 1269.921
#Because the above histogram portrays a binary variable, the histogram
#only shows values of zero or one. From the above function, the estimate
#for the proportion of p is 0.1357 (rounded to four decimal places),
#and its standard error is 0.0086 (rounded to four decimal places).