Winequality_Red<-read.csv("C:/Users/Pallavi/Desktop/MSBA Aug22/Stat Methods/Wine_Project/winequality-red.csv")

mu_density<-mean(Winequality_Red$density,na.rm = TRUE)


#Q1(a) Provide an estimate of mu based on the sample;

mu_density
## [1] 0.9967467
#The mean(mu) = 0.9967467

# This is sample mean of 1599 red wines is an estimate mean of 
# population mean of all the wines from purtugal.

#Q1(b) How to quantify the variability (uncertainty) of the estimate?
# Central Limit Theoram  - N(mu,sigma sq/n)
# mu.hat ~ N(mu,sigma sq) where mu.hat = sample mean random variable, 
# mu = Polpulation mean of unknown parameter

sd_density<-sd(Winequality_Red$density)
sd_density # 0.001887334
## [1] 0.001887334
# Since our wine sample belongs to same population of purugal 
# then population mean x bar will follow same sample mean. 
# and x bar ~ N(mu,sigma sq/n)

#Based on the sample estimate, the mu value is very close to one, 
#and the variability is close to zero, so the Central Limit Theorem 
#can be used as the distribution appears to be normal. 
#We can also obersve the pattern of curve through the histogram below as 
#it shows a normal distribution for the sample of density.

hist(Winequality_Red$density)

sd.mu.hat<-sd(Winequality_Red$density)/sqrt(length(Winequality_Red$density))
sd.mu.hat   # 4.71981e-05
## [1] 4.71981e-05
# mu.hat ~ N(0.996(mu),sq(4.71981e-05) )
 
# Thus the estimate mu hat(random variable) will fall 
# in the interval (mu- 0.0000032,mu+0.0000032) with 95% chance.

#Q1(c) Use the CLT to give a 95% confidence interval for mu

#    Pr(mu.hat-0.0000032 <= mu <= mu.hat+0.0000032 )

# Calculating Lower CI 
lower_CI<-mu_density-2*(sd.mu.hat)
lower_CI  # 0.9966162
## [1] 0.9966523
# Calculating Upper CI 
upper_CI<-mu_density+2*(sd.mu.hat)
upper_CI  # 0.996805
## [1] 0.9968411
# In case of density the confidence interval will be (0.9966162,0.996805)
  
# Therefore we can say mu is in the interval (0.9966162,0.996805) 
# with 95% chance. Assuming mu as a fixed coin and CI (0.9966162,0.996805) 
# as a moving bucket, the moving bucket will cover this fixed coin with 95%
# probability. 


#Q1(d)Use the bootstrap method to do parts b and c, and compare 
#  the results with those obtained from the CLT. State your findings. 


mu_density.set <- NULL

for (k in 1:2000) {
  density.bootstrap <- sample(Winequality_Red$density, size=1599, replace=T)
  mu_density <- mean(density.bootstrap)
  mu_density.set[k] <- mu_density
}

sd(mu_density.set) # SD from bootstrap 4.644378e-05
## [1] 4.682341e-05
# Comparing 2 estimates from CLT and bootstrap 
#  sd from CLT = 4.71981e-05 , sd from bootstrap = 4.644378e-05

# We can see that sd from bootstrap is almost similar to CLT. 

CI_boostrap<-quantile(mu_density.set,probs= c(0.025,0.975)) 
CI_boostrap
##      2.5%     97.5% 
## 0.9966560 0.9968373
# CI from bootstrap @ 2.5% and 97.5% is 0.9966585 and 0.9968417 respectively.
# CI from CLT @ 2.5% and 97.5% is 0.9967396 and 0.9967538 respectively

# We can see that CI from bootstrap is almost similar to CLT.


#Q1(e) Can we use a normal distribution to model “density”? 
#   If yes, what are the maximum likelihood estimates of the mean
#    and standard deviation? Please provide their standard errors as well. 

hist(Winequality_Red$density)

# From the histogram it is evident that density function
# is normally distributed and we can use normal distribution to model"density".

# So now let's calculate max.likelyhood estimates of the mean and SD.
# Since density data values are in decimals so we will multiply 
#  these values by 10 and then divide estimates and sd by 10 to 
#  get precise MLE results.

# multiplying density variable by 10 and getting new density

density_no2<-Winequality_Red$density*10

# Checking the histogram for new density variable
hist(density_no2)

# Calculating density function for the 1st observation

dnorm(density_no2[1],mean=mean(density_no2),sd=sd(density_no2))
## [1] 18.08945
#[1] 18.08945

sd_density_no2<-sd(density_no2)

#[1] 0.01887334


minuslog.lik<-function(mu, sigma){
log.lik<-0
for(i in 1:1599){
log.lik<-log.lik+log(dnorm(density_no2[i], mean=mu,
sd=sigma))
}
return(-log.lik)
}

minuslog.lik(80,20)  #16062.54
## [1] 16062.54
minuslog.lik(100,30) # 14108.6
## [1] 14108.6
minuslog.lik(0,1)    # 80900.31
## [1] 80900.31
est <- mle(minuslog=minuslog.lik,
start=list(mu=mean(density_no2),
sigma=sd(density_no2)))

summary(est)
## Maximum likelihood estimation
## 
## Call:
## mle(minuslogl = minuslog.lik, start = list(mu = mean(density_no2), 
##     sigma = sd(density_no2)))
## 
## Coefficients:
##         Estimate   Std. Error
## mu    9.96746679 0.0004719810
## sigma 0.01887334 0.0003296881
## 
## -2 log L: -8159.31
# Maximum Likelyhood esimation

#Coefficients:
#         Estimate    Std. Error
#  mu    9.96746679   0.0004719810
#  sigma 0.01887334   0.0003296881

#   -2 log L: -8159.31

# Now since we get final values so now need to divide 
# estimates and sd by 10 to get precise MLE results.

#Coefficients new values after dividing by 10 can be seen below: 

#         Estimate    Std. Error
#  mu    0.9967       0.0001   # rounded to 4 decimal places
#  sigma 0.0019       0.0000   # rounded to 4 decimal places

# So max.likelyhood estimation of the mean is 0.9967 and of 
# the standard deviation is 0.0019. and std.error of mean is 0.0001 
# and standard deviation is 0.0000.(All rounded to four decimal places)




#Q2. Suppose the population mean of the variable “residual sugar” is mu , 
#    answer the following questions.

mu_residual_sugar<-mean(Winequality_Red$residual.sugar,na.rm = TRUE)
mu_residual_sugar<-mean(Winequality_Red$residual.sugar,na.rm=TRUE)



#Q2(a) Provide an estimate of mu based on the sample;
mu_residual_sugar
## [1] 2.538806
#The mean(mu) = 2.538806

#Q2(b)Noting that the sample distribution of “residual sugar” 
#   is highly skewed, can we use the CLT to quantify the variability
#   of your estimate? Can we use the CLT to give a 95% confidence
#   interval for mu? If yes, please give your solution. If no, explain why. 

hist(Winequality_Red$residual.sugar)

summary(Winequality_Red$residual.sugar)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.900   1.900   2.200   2.539   2.600  15.500
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#  0.900   1.900   2.200   2.539   2.600  15.500 

# From the histogram it is clear that distribution of residual 
# sugar is highly skewed towards right and from summary function it 
# is clear the distence bw 3rd qu and max difference is 12.9. 

# Now the Central limit theoram says mu.hat = N(mu,sq sd)
#  "The estimate mu.hat,as a random variable is unbiased i.e.E(mu.hat)=mu" 
#  "The estimate mu.hat,as a random variable is distributed 
#   symetrically around unknown parameter(mu)" 
#  "And estimate mu.hat,as a random variable, will fall in the interval
#   (mu-sd.hat,mu+sd.hat) with 95% chance. and parameter mu will 
#    follow the normal distribution N(mu,sd.mu.hat) as     
#    per Central Limit Theoram."

#  So as per above theoram although residual sugar data is highly  
#  skewed,but means of random variable will be noramlly distributed 
#  since sample size is big (1599) That's why we can use the 
#  CLT to quantify the variability of our estimate

#Using the CLT to give a 95% confidence interval for mu

sd_residual_sugar<-sd(Winequality_Red$residual.sugar)/sqrt(length(Winequality_Red$residual.sugar))
sd_residual_sugar   # 0.03525922
## [1] 0.03525922
# Calculating Lower CI 

lower_CI_residual_sugar<-mu_residual_sugar-2*(sd_residual_sugar)
lower_CI_residual_sugar  # 2.468287
## [1] 2.468287
# Calculating Upper CI 

upper_CI_residual_sugar<-mu_residual_sugar+2*(sd_residual_sugar)
upper_CI_residual_sugar  # 2.609324
## [1] 2.609324
# As evident from above CI, we are able to use the CLT method to 
# give a 95% confidence interval for the sample of mu. The solution is: (2.4683, 2.6093).


# In case of density the confidence interval will be (0.9966162,0.996805)
  

#Q2(c)Use the bootstrap method to do part b. Is the bootstrap confidence interval
#     symmetric? (hint: check the bootstrap distribution; see p. 43 in Lecture 3). 



mu_residual_sugar.set <- NULL

for (k in 1:2000) {
  residual_sugar.bootstrap <- sample(Winequality_Red$residual.sugar, size=1599, replace=T)
  mu_residual_sugar <- mean(residual_sugar.bootstrap)
  mu_residual_sugar.set[k] <- mu_residual_sugar
}

sd(mu_residual_sugar.set) # SD from bootstrap 0.03519252
## [1] 0.03493251
# Comparing 2 estimates from CLT and bootstrap 
#  sd from CLT = 0.03525922 , sd from bootstrap = 0.03519252

# We can see that sd(variablity) from bootstrap is almost similar to CLT. 

CI_boostrap<-quantile(mu_residual_sugar.set,probs= c(0.025,0.975)) 
CI_boostrap
##     2.5%    97.5% 
## 2.473698 2.610956
# CI from bootstrap @ 2.5% and 97.5% is 2.471883 and 2.608983  respectively.
# CI from CLT @ 2.5% and 97.5% is 2.468287 and 2.609324 respectively

# CI from bootstrap is almost similar to CLT

# Checking CI of bootstrap is symmetric or not through histograme.

hist(mu_residual_sugar.set,lwd=5,freq = FALSE)
lines(density(mu_residual_sugar.set),lwd=5,col='red')

# We can see from above histogram obtained from bootstrap that the 
# density curve is most likely symmetric and normally distributed.


#Q2(d)Can we use a normal distribution to model “residual sugar”? If no, what
#       distribution do you think can approximate its empirical distribution? What
#       parameters are needed to characterize such a distribution? what are their
#       maximum likelihood estimates? Please provide their standard errors as well. 


# No we can not use normal distribution to model " residual sugar" because
# as evident from  histogram of origional data of residual sugar, it is highly
# right side skewed.  Having said  that a log-normal can approximate its empirical 
# distribution because A log-normal distribution is a continuous distribution of
# random variable whose natural logarithm is normally distributed.
#The Log Normal Distribution is described as, “Density, distribution function, 
# quantile function and random generation for the log normal distribution whose 
# logarithm has mean equal  to meanlog and standard deviation equal to sdlog.”
# 
# The parameters mu(mean) and sigma(sd) are two parameters of the logarithm.  
#  dlnorm ->  gives the density of the function, 
#  plnorm ->  gives the distribution function, 
#  qlnorm ->  gives the quantile function, 
#  rlnorm ->  generates random deviates.

# So now let's calculate max.likelyhood estimates of the mean and SD.

# Calculating density function for the 1st observation

residual_sugar<-Winequality_Red$residual.sugar


dnorm(residual_sugar[1],mean=mean(residual_sugar),sd=sd(residual_sugar))
## [1] 0.2553509
#[1] 0.2553509

sd_residual_sugar<-sd(residual_sugar)
sd_residual_sugar
## [1] 1.409928
#[1]  1.409928


minuslog.lik<-function(mu, sigma){
log.lik<-0
for(i in 1:1599){
log.lik<-log.lik+log(dlnorm(residual_sugar[i], meanlog=mu,
sdlog=sigma))
}
return(-log.lik)
}

minuslog.lik(80,20)  #20140.88
## [1] 20140.88
minuslog.lik(100,30) #17000.45
## [1] 17000.45
minuslog.lik(0,1)    #3508.942
## [1] 3508.942
est <- mle(minuslog=minuslog.lik,
start=list(mu=log(mean(residual_sugar)),
sigma=log(sd(residual_sugar))))

summary(est)
## Maximum likelihood estimation
## 
## Call:
## mle(minuslogl = minuslog.lik, start = list(mu = log(mean(residual_sugar)), 
##     sigma = log(sd(residual_sugar))))
## 
## Coefficients:
##        Estimate  Std. Error
## mu    0.8502341 0.008935942
## sigma 0.3573260 0.006318293
## 
## -2 log L: 3965.773
# Maximum Likelyhood esimation

# Coefficients:
#        Estimate  Std. Error
#  mu    0.8502341 0.008935942
#  sigma 0.3573260 0.006318293

# -2 log L: 3965.773 

# So max.likelyhood estimation of the mean is 0.8502 and of the standard 
# deviation is 0.3573. and std.error of mean is 0.0089 and standar deviation 
# is 0.0063.(All rounded to four decimal places)





#Q3. We classify those wines as “excellent” if their rating is at least 7. 
# Suppose the population proportion of excellent wines is p. Do the following:

#Q3(a) Use the CLT to derive a 95% confidence interval for p;
  
#  First,we will create a binary variable and categorize values as per 
#  either one or zero based on their quality; 
#   If the wine’s rating is at least 7  ->  The value will be 1 
#   If the wine’s rating is less than 7 ->  The value will be 0 
# We will use The Bernoulli distribution in this case."The Bernoulli 
# distribution models any data generating process that can be thought of 
# as tossing a coin with a probability p of seeing the head. We say that
# a random variable X follows a Bernoulli distribution if:

# • It only has two possible outcomes 0 and 1 (no / yes); and
# • Its probabilities satisfy
#       Pr(X = 1) = p = 1 - Pr(X = 0) = 1 - q"

# Creating binary variables

wine_excellent<- as.numeric(Winequality_Red$quality >= 7) 


# Sample proportion

p_hat<-mean(wine_excellent)

p_hat   #0.1357098
## [1] 0.1357098
# Standard deviation of sample proportion

sd_excellent<-sqrt(p_hat*(1-p_hat)/length(wine_excellent))



sd_excellent #0.008564681
## [1] 0.008564681
# Lower CI for p

lower_CI_p <- p_hat - 2*(sd_excellent)

lower_CI_p   # 0.1185805 
## [1] 0.1185805
upper_CI_p<-p_hat + 2*(sd_excellent)

upper_CI_p   # 0.1528392
## [1] 0.1528392
# The lower CI for p is 0.1186 and upper CI for p is 0.1528 rounded 
# to 4 decimal points at 95% confidence interval. 

# As evident above, we are able to use the CLT to give a 95% confidence 
# interval for p. The solution is: (0.1186, 0.1528).

#3(b) Use the bootstrap method to derive a 95% confidence interval for p  

mu_p.set <- NULL

for (k in 1:2500) {
  p.bootstrap <- sample(wine_excellent, size=1599, replace=T)
  mu_p <- mean(p.bootstrap)
  mu_p.set[k] <- mu_p
}

sd(mu_p.set) # SD from bootstrap 0.008717535
## [1] 0.008566244
# Comparing 2 estimates from CLT and bootstrap 
#  sd from CLT = 0.008564681 , sd from bootstrap = 0.008717535

# We can see that sd from bootstrap is almost similar to CLT. 

CI_boostrap_p<-quantile(mu_p.set,probs= c(0.025,0.975)) 
CI_boostrap_p   
##      2.5%     97.5% 
## 0.1194497 0.1525954
# CI from bootstrap @ 2.5% and 97.5% is 0.1188243 and 0.1525954 respectively.


# Let's see the histograme of same

hist(mu_p.set,freq = FALSE)
lines(density(mu_p.set),lwd=5,col='red')

#Q3(c)Compare the two intervals. Is there any difference worth our attention? 

# CI from bootstrap @ 2.5% and 97.5% is 0.1188243 and 0.1525954 respectively.
# CI from CLT @ 2.5% and 97.5% is 0.1185805  and 0.1528392 respectively

# We can see that CI from bootstrap is almost similar to CLT.and there is 
# no difference worth our attention.

#Q3(d) What is the maximum likelihood estimate of p and its standard error? 

hist(wine_excellent)

minuslog.lik <- function(p) {
  log.lik <- 0
  for(i in 1:1599) {
    log.lik <- log.lik + log(dbinom(wine_excellent[i], size = 1, prob = p))
  }
  return(-log.lik)
}

est <- stats4::mle(minuslog = minuslog.lik,
                   start = list(p = p_hat))

summary(est)
## Maximum likelihood estimation
## 
## Call:
## stats4::mle(minuslogl = minuslog.lik, start = list(p = p_hat))
## 
## Coefficients:
##    Estimate  Std. Error
## p 0.1357098 0.008564278
## 
## -2 log L: 1269.921
#Maximum likelihood estimation

#Call:
#stats4::mle(minuslogl = minuslog.lik, start = list(p = p_hat))

#Coefficients:
#     Estimate  Std. Error
#  p 0.1357098 0.008564278

#  -2 log L: 1269.921 


#Because the above histogram portrays a binary variable, the histogram 
#only shows values of zero or one. From the above function, the estimate
#for the proportion of p is 0.1357 (rounded to four decimal places), 
#and its standard error is 0.0086 (rounded to four decimal places).