setwd("C:/Users/John Trygier/Documents/Schoolwork/Graduate/Fall 2021/Statistical Methods/Project/Part A/Data")
rw <- read.csv("winequality-red.csv")
summary(rw)
## fixed.acidity.volatile.acidity.citric.acid.residual.sugar.chlorides.free.sulfur.dioxide.total.sulfur.dioxide.density.pH.sulphates.alcohol.quality
## Length:1599
## Class :character
## Mode :character
# The semicolon-delimited file was read in as a single character column, so split each row on ";"
my_list <- apply(rw, 1, strsplit, split = ";")
# unlist and turn it into a numeric matrix, one row per wine
my_matrix <- matrix(as.numeric(unlist(my_list)), nrow = nrow(rw),
ncol = length(my_list[[1]][[1]]), byrow = TRUE)
rw.df <- as.data.frame(my_matrix)
colnames(rw.df) <- c("Fixed Acidity", "Volatile Acidity", "Citric Acid", "Residual Sugar", "Chlorides", "Free Sulfur Dioxide", "Total Sulfur Dioxide", "Density", "pH", "Sulphates", "Alcohol Volume", "Quality")
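For reference, a shorter route is possible (a sketch, assuming the same working directory and file): read.csv can split on the semicolon delimiter directly, which should reproduce rw.df up to the renamed columns.
# Hedged alternative, not the approach used above: read the semicolon-delimited file in one step
rw.alt <- read.csv("winequality-red.csv", sep = ";")
colnames(rw.alt) <- colnames(rw.df)   # align names with rw.df for comparison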
mu.hat<-mean(rw.df$Density)
mu.hat
## [1] 0.9967467
sd.mu.hat<-sd(rw.df$Density)/sqrt(length(rw.df$Density))
sd.mu.hat
## [1] 4.71981e-05
hist(rw.df$Density)
The histogram above shows the distribution of the “Density” variable; the kernel density estimate below approximates the same distribution as a smooth curve.
density(rw.df$Density)
##
## Call:
## density.default(x = rw.df$Density)
##
## Data: rw.df$Density (1599 obs.); Bandwidth 'bw' = 0.0003433
##
## x y
## Min. :0.9890 Min. : 0.01663
## 1st Qu.:0.9930 1st Qu.: 4.77847
## Median :0.9969 Median : 20.89383
## Mean :0.9969 Mean : 63.71438
## 3rd Qu.:1.0008 3rd Qu.:101.28509
## Max. :1.0047 Max. :242.79355
plot(density(rw.df$Density))
The plot above shows the kernel density estimate for the “Density” variable.
sd(rw.df$Density)/sqrt(length(rw.df$Density))
## [1] 4.71981e-05
We use the formula above to calculate the standard error of “mu hat” for the “Density” variable; this standard error is what the confidence interval below is built from.
mean(rw.df$Density) - 2*sd(rw.df$Density)/sqrt(length(rw.df$Density))
## [1] 0.9966523
mean(rw.df$Density) + 2*sd(rw.df$Density)/sqrt(length(rw.df$Density))
## [1] 0.9968411
The 95% confidence interval for the “Density” variable is approximately (0.9967, 0.9968).
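As a sanity check (my addition, not part of the original calculation), base R’s t.test reports a t-based 95% interval for the mean, which should be very close to the interval built by hand above.
# Cross-check: t-based 95% confidence interval for the mean of Density
t.test(rw.df$Density)$conf.int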
mu.hat.set<-NULL ### Used to store mu.hat’s
for(k in 1:2000){ ### Repeat sampling 2000 times: simulate a Poisson sample with lambda = observed mean
sample<-rpois(n = 1599, lambda = mean(rw.df$Density))
mu.hat<-mean(sample)
mu.hat.set[k]<-mu.hat
}
mu.hat.set<-NULL ### Reset; used to store the bootstrap mu.hat’s
for(k in 1:2000){ ### Bootstrap: resample the simulated sample, with replacement, 2000 times
sample.bootstrap<-sample(sample, size=1599,
replace=T)
mu.hat<-mean(sample.bootstrap)
mu.hat.set[k]<-mu.hat
}
sd(mu.hat.set)
## [1] 0.02474253
hist(mu.hat.set,freq = FALSE)
lines(density(mu.hat.set), lwd=5, col='blue')
c(mu.hat-2*sd.mu.hat, mu.hat+2*sd.mu.hat)
## [1] 0.9942771 0.9944659
Here we use the bootstrap method, resampling with replacement from the simulated sample, to approximate the sampling distribution of the mean. The resulting interval shown above is approximately (0.9943, 0.9945).
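For comparison, a minimal sketch (my addition, not the original workflow) resamples the observed Density values themselves, rather than a simulated Poisson sample, and reads off percentile endpoints.
# Sketch: percentile bootstrap built directly from the original Density observations
boot.means <- replicate(2000, mean(sample(rw.df$Density, size = 1599, replace = TRUE)))
quantile(boot.means, c(0.025, 0.975))   # percentile 95% interval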
sd(mu.hat.set)
## [1] 0.02474253
mean(mu.hat.set)
## [1] 0.9741276
These are the metrics we can glean from the bootstrap replicates: their standard deviation is roughly 0.025 and their mean is roughly 0.974.
hist(mu.hat.set,freq = FALSE)
lines(density(mu.hat.set), lwd=5, col='blue')
We can see that “Density” is well approximated by a normal distribution. So, yes, we can model it this way.
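A normal quantile-quantile plot (a quick sketch, not in the original analysis) gives a second visual check of that claim: the points should track the reference line if Density is roughly normal.
qqnorm(rw.df$Density)   # sample quantiles against theoretical normal quantiles
qqline(rw.df$Density)   # reference line through the first and third quartiles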
library(stats4)
likelihood.fun<-function(mu){
dnorm(x=mean(rw.df$Density),mean=mu, sd=1)
}
minuslog.lik<-function(mu){ ###-log(likelihood)
-log(likelihood.fun(mu))
}
est <- mle(minuslog=minuslog.lik, start=list(mu=0))
summary(est)
## Maximum likelihood estimation
##
## Call:
## mle(minuslogl = minuslog.lik, start = list(mu = 0))
##
## Coefficients:
## Estimate Std. Error
## mu 0.9967467 1
##
## -2 log L: 1.837877
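Because the likelihood above is evaluated at a single point (the sample mean) with the standard deviation fixed at 1, the reported standard error of 1 is not informative. A hedged sketch of the same stats4::mle machinery applied to all 1,599 observations, holding the standard deviation at its sample value, recovers essentially the same estimate of mu with a much smaller standard error.
# Sketch (assumes rw.df from above): full-data negative log-likelihood, sd held fixed
minuslog.lik.full<-function(mu){
-sum(dnorm(rw.df$Density, mean=mu, sd=sd(rw.df$Density), log=TRUE))
}
est.full <- mle(minuslogl=minuslog.lik.full, start=list(mu=1))
summary(est.full)   # estimate should be close to mean(rw.df$Density)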
minuslog.lik<-function(mu, sigma){
log.lik<-0
for(i in 1:1599){
log.lik<-log.lik+log(dnorm(rw.df$Density[i]*10, mean=mu, sd=sigma))
}
return(-log.lik)
}
est1 <- mle(minuslog=minuslog.lik,
start=list(mu=mean(rw.df$`Residual Sugar`), sigma=sd(rw.df$`Residual Sugar`)))
summary(est1)
## Maximum likelihood estimation
##
## Call:
## mle(minuslogl = minuslog.lik, start = list(mu = mean(rw.df$`Residual Sugar`),
## sigma = sd(rw.df$`Residual Sugar`)))
##
## Coefficients:
## Estimate Std. Error
## mu 215.9203 25.2109
## sigma 1085.4575 NaN
##
## -2 log L: 25349.57
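The NaN standard error and very large sigma above suggest the optimizer, started from the Residual Sugar moments, did not settle near the scale of the (×10) Density data. The following sketch (my addition, not the original analysis) restarts the same fit from moment-based starting values for the scaled variable.
# Sketch: refit with starting values taken from the scaled Density data
est1.alt <- mle(minuslogl=minuslog.lik,
start=list(mu=mean(rw.df$Density*10), sigma=sd(rw.df$Density*10)))
summary(est1.alt)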
mean(rw.df$`Residual Sugar`)
## [1] 2.538806
mu <- mean(rw.df$`Residual Sugar`)
Yes. The CLT describes the distribution of the sample mean across repeated samples: as the sample size grows, that distribution becomes approximately normal regardless of how skewed the original data are.
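A small simulation illustrates the point (a sketch using an arbitrary right-skewed exponential population, not the wine data): the means of repeated samples pile up in a roughly normal shape even though the individual draws are heavily skewed.
# CLT sketch: means of 2000 samples of size 100 from a skewed exponential population
clt.means <- replicate(2000, mean(rexp(n=100, rate=1)))
hist(clt.means, freq = FALSE)
lines(density(clt.means), lwd=5, col='blue')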
mu.hat<-mean(rw.df$`Residual Sugar`)
mu.hat
## [1] 2.538806
sd.mu.hat<-sd(rw.df$`Residual Sugar`)/sqrt(length(rw.df$`Residual Sugar`))
sd.mu.hat
## [1] 0.03525922
hist(rw.df$`Residual Sugar`)
mu.hat.set<-NULL ### Used to store mu.hat’s
for(k in 1:2000){ ### Repeat sampling 2000 times
sample<-rpois(n = 1599, lambda = mean(rw.df$`Residual Sugar`))
mu.hat<-mean(sample)
mu.hat.set[k]<-mu.hat
}
mu.hat.set<-NULL ### Used to store mu.hat’s
for(k in 1:2000){ ### Repeat sampling 2000 times
sample.bootstrap<-sample(sample, size=1599,
replace=T)
mu.hat<-mean(sample.bootstrap)
mu.hat.set[k]<-mu.hat
}
sd(mu.hat.set)
## [1] 0.04028049
hist(mu.hat.set,freq = FALSE)
lines(density(mu.hat.set), lwd=5, col='blue')
c(mu.hat-2*sd.mu.hat, mu.hat+2*sd.mu.hat)
## [1] 2.519225 2.660262
sd(mu.hat.set)
## [1] 0.04028049
mean(mu.hat.set)
## [1] 2.581158
hist(mu.hat.set,freq = FALSE)
lines(density(mu.hat.set), lwd=5, col='blue')
The bootstrap confidence interval is symmetric, as demonstrated by the histogram and density curve in the graphic above.
hist(rw.df$`Residual Sugar`)
We cannot use a normal distribution to model “Residual Sugar”: the histogram above shows the data are strongly right-skewed. A lognormal distribution is a better approximation to the empirical distribution.
minuslog.lik<-function(mu, sigma){
log.lik<-0
for(i in 1:1599){
log.lik = log.lik + log(dlnorm(x=rw.df$`Residual Sugar`[i], meanlog = mu, sdlog = sigma))
}
return(-log.lik)
}
est.lognorm<-mle(minuslog=minuslog.lik, start=list(mu=log(mean(rw.df$`Residual Sugar`)),
sigma=log(sd(rw.df$`Residual Sugar`))))
summary(est.lognorm)
## Maximum likelihood estimation
##
## Call:
## mle(minuslogl = minuslog.lik, start = list(mu = log(mean(rw.df$`Residual Sugar`)),
## sigma = log(sd(rw.df$`Residual Sugar`))))
##
## Coefficients:
## Estimate Std. Error
## mu 0.8502341 0.008935942
## sigma 0.3573260 0.006318293
##
## -2 log L: 3965.773
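To see how well the fitted lognormal tracks the data, the following sketch (assuming est.lognorm from above) overlays the fitted density on the histogram of Residual Sugar.
# Overlay the fitted lognormal density on the empirical distribution
hist(rw.df$`Residual Sugar`, freq = FALSE)
curve(dlnorm(x, meanlog=coef(est.lognorm)["mu"], sdlog=coef(est.lognorm)["sigma"]),
add=TRUE, lwd=5, col='blue')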
The estimated meanlog and sdlog parameters above, together with their standard errors, are what characterize that lognormal distribution; the corresponding mean and standard deviation on the original scale follow from the lognormal moment formulas.
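A short sketch (assuming the est.lognorm fit above) applies those moment formulas to convert the fitted parameters back to the original scale.
mu.log <- coef(est.lognorm)["mu"]
sigma.log <- coef(est.lognorm)["sigma"]
exp(mu.log + sigma.log^2/2)                                # implied mean on the original scale
sqrt((exp(sigma.log^2) - 1)*exp(2*mu.log + sigma.log^2))   # implied standard deviation on the original scale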
We’ll start by creating a new column in the dataset that indicates whether the wine is “excellent” or not.
rw.df$excellent <- ifelse(rw.df$Quality >= 7, 1, 0)
p = sum(rw.df$excellent)/nrow(rw.df)
sum(rw.df$excellent)/nrow(rw.df)
## [1] 0.1357098
We can see that approximately 13.6% of wines in the dataset can be classified as “excellent”.
mu.hat<-mean(rw.df$excellent)
mu.hat
## [1] 0.1357098
sd.mu.hat<-sd(rw.df$excellent)/sqrt(length(rw.df$excellent))
sd.mu.hat
## [1] 0.00856736
Now that we have the mean and its standard error, we can construct our 95% confidence interval.
mean(rw.df$excellent) + 2 * sd(rw.df$excellent)/sqrt(length(rw.df$excellent))
## [1] 0.1528445
mean(rw.df$excellent) - 2 * sd(rw.df$excellent)/sqrt(length(rw.df$excellent))
## [1] 0.1185751
Our 95% confidence interval for the proportion of excellent wines is approximately (0.119, 0.153).
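Because “excellent” is a 0/1 indicator, the same interval can also be obtained as a binomial proportion; a sketch using base R’s prop.test (not part of the original workflow) should give a very similar range.
# Cross-check: confidence interval for the proportion of excellent wines
prop.test(sum(rw.df$excellent), nrow(rw.df))$conf.int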
mu.hat.set<-NULL ### Used to store mu.hat’s
for(k in 1:2000){ ### Repeat sampling 2000 times
sample<-rpois(n = 1599, lambda = mean(rw.df$excellent))
mu.hat<-mean(sample)
mu.hat.set[k]<-mu.hat
}
mu.hat.set<-NULL ### Used to store mu.hat’s
for(k in 1:2000){ ### Repeat sampling 2000 times
sample.bootstrap<-sample(sample, size=1599,
replace=T)
mu.hat<-mean(sample.bootstrap)
mu.hat.set[k]<-mu.hat
}
sd(mu.hat.set)
## [1] 0.009343369
hist(mu.hat.set,freq = FALSE)
lines(density(mu.hat.set), lwd=5, col='blue')
c(mu.hat-2*sd.mu.hat, mu.hat+2*sd.mu.hat)
## [1] 0.1267052 0.1609746
Above we see the distribution of our estimated mean. The bootstrap method yields a 95% confidence interval of roughly (0.127, 0.161), versus the CLT-based interval of (0.119, 0.153). The two intervals are broadly similar.
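As with Density, a sketch (my addition) that resamples the observed 0/1 indicator directly gives a percentile-based interval for comparison.
# Percentile bootstrap for the proportion of excellent wines
boot.props <- replicate(2000, mean(sample(rw.df$excellent, size=1599, replace=TRUE)))
quantile(boot.props, c(0.025, 0.975))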
library(stats4)
likelihood.fun<-function(mu){
dnorm(x=mean(rw.df$excellent),mean=mu, sd=1)
}
minuslog.lik<-function(mu){ ###-log(likelihood)
-log(likelihood.fun(mu))
}
est <- mle(minuslog=minuslog.lik, start=list(mu=0))
summary(est)
## Maximum likelihood estimation
##
## Call:
## mle(minuslogl = minuslog.lik, start = list(mu = 0))
##
## Coefficients:
## Estimate Std. Error
## mu 0.1357098 1
##
## -2 log L: 1.837877
minuslog.lik<-function(mu, sigma){
log.lik<-0
for(i in 1:1599){
log.lik<-log.lik+log(dnorm(rw.df$excellent[i]*10, mean=mu, sd=sigma))
}
return(-log.lik)
}
est1 <- mle(minuslog=minuslog.lik,
start=list(mu=mean(rw.df$excellent), sigma=sd(rw.df$excellent)))
summary(est1)
## Maximum likelihood estimation
##
## Call:
## mle(minuslogl = minuslog.lik, start = list(mu = mean(rw.df$excellent),
## sigma = sd(rw.df$excellent)))
##
## Coefficients:
## Estimate Std. Error
## mu 20.28288 15.82208
## sigma 621.55174 NaN
##
## -2 log L: 23510.53