Exploration #1 - Weight

# Weight by candy type: raw points plus box plots, both coloured by candy
# colour. `color = Color` is inherited by every layer, so the box plots are
# split by colour within each type.
#
# Two layers/mappings were removed because they never contributed to the plot:
#   * `fill = Weight` on the box plots - Weight varies inside every box, so the
#     box-plot stat cannot carry it through and discards it (newer ggplot2
#     versions warn about the dropped aesthetic);
#   * `geom_smooth(method = "lm")` - each Type x Color group sits at a single
#     x position, so there is no line to fit and nothing is drawn.
ggplot(data = MMs, aes(x = Type, y = Weight, color = Color)) +
  geom_point() +
  geom_boxplot() +
  # Six manual colours, matched to the Color values in their sorted order.
  scale_color_manual(
    values = c("#0f6df2", "#996a05", "#2dbd4c", "#f79902", "#e61102", "#fffb00")
  ) +
  labs(
    x = "Type",
    y = "Weight",
    title = "MMs - Weight by Type sorted by color"
  )

# Stacked bar chart of the colour mix within each candy type.

# Bar fill colours, matched to the Color values in their sorted order.
fillc <- c("dodgerblue2", "saddlebrown", "green3", "darkorange", "red", "yellow")

# Count candies per Type x Color. summarise() peels off the last grouping
# variable, so inside mutate() the data are still grouped by Type and
# sum(count) is the per-type total: `prop` is each colour's share of its own
# type, on a 0-1 scale. (It was previously stored as share / 100 and multiplied
# back by 100 when plotting; the bars drawn are the same.)
mms1 <- MMs %>%
  group_by(Type, Color) %>%
  summarise(count = n()) %>%
  mutate(prop = count / sum(count)) %>%
  ungroup()

mms2 <- ggplot(mms1, aes(x = factor(Type), y = prop, fill = factor(Color))) +
  geom_bar(stat = "identity", width = 0.7) +
  labs(x = "Type", y = "Proportion", fill = "Color", title = "MMs - Color vs. Type") +
  theme_minimal(base_size = 14) +
  scale_fill_manual(values = fillc) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
mms2


Outlier Rule #1: Large > Q3 + 1.5(IQR) and Small < Q1 - 1.5(IQR)

Outlier Rule #2: Large > \(\bar{x}\) + 2s and Small < \(\bar{x}\) - 2s


Lower Fence = -0.875
Minimum Value = 0.50
No Small Outliers.
Upper Fence = 3.575
Maximum Value = 4.25
Large Outliers are listed below.


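The distribution of sample means, \(\bar{x}\), from samples of size n is:
Shape: the standardized sample mean, \(\frac{\bar{x} - \mu}{s/\sqrt{n}}\), is t-distributed with n-1 degrees of freedom
Center: Mean = \(\mu_{\bar{x}} = \mu\)
Spread: Standard Deviation = \(\sigma_{\bar{x}} = \frac{\sigma}{\sqrt{n}}\), estimated by the standard error \(\frac{s}{\sqrt{n}}\)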

The confidence interval for a population mean \(\mu\) is:
\(\bar{x} - E < \mu < \bar{x} + E\)
where \(E = t_{CI,df}\cdot\frac{s}{\sqrt{n}}\), with df = n-1 degrees of freedom.


We are 95% confident that the true mean weight of MMs is between 1.33 g and 1.43 g.

# t density with the central 95% region shaded.
# The critical values, the shaded density and the outline curve all use the
# same degrees of freedom (count - 1), so the shaded area lies exactly under
# the curve. Previously the shading used 5 df and the outline was the standard
# normal density, which did not match the cut-offs computed from count - 1 df.
count <- 1035
tl <- qt(.025, count - 1)  # lower 2.5% critical value
tu <- qt(.975, count - 1)  # upper 2.5% critical value

# t density at `x`, set to NA outside [tl, tu] so that only the central region
# is filled when this is drawn with geom = "area".
dt_limit <- function(x) {
  y <- dt(x, count - 1)
  y[x < tl | x > tu] <- NA
  y
}

tf <- ggplot(data.frame(x = c(-5, 5)), aes(x = x))
tf1 <- tf +
  stat_function(fun = dt_limit, geom = "area", fill = "red", alpha = 0.2) +
  stat_function(fun = dt, args = list(df = count - 1)) +
  scale_x_continuous(name = "t") +
  scale_y_continuous(name = "Probability") +
  ggtitle("t distribution function curves of probabilities")
tf1

Exploration #2 - One Color Proportion

# Sample proportion of red Milk Chocolate candies, its standard error, and a
# normal-approximation interval around it.
RMC <- MMs %>% filter(Color == "Red", Type == "Milk_Chocolate")
totalRMC <- nrow(RMC)
totalRMC
## [1] 93
totalMMs <- nrow(MMs)
totalMMs
## [1] 1035
# NOTE(review): the denominator is every candy of every type. If the quantity
# of interest is the share of red among Milk Chocolate candies only (the 0.13
# figure used later), the denominator should be the Milk Chocolate count -
# confirm the intended population.
phat <- totalRMC / totalMMs
phat
## [1] 0.08985507
SE <- sqrt(phat * (1 - phat) / totalMMs)
SE
## [1] 0.008889072
# NOTE(review): the 0.95 quantile gives the margin for a 90% two-sided
# interval; a 95% interval would use the 0.975 quantile - confirm the level.
ME <- qnorm(0.95) * SE
ME
## [1] 0.01462122
ci <- phat + c(-1, 1) * ME
ci
## [1] 0.07523385 0.10447629

The distribution of sample proportions, \(\hat{p}\), from samples of size n is:
Shape: Approximately Normal
Center: Mean = \(\mu_{\hat{p}} = p\)
Spread: Standard Deviation = \(\sigma_{\hat{p}} = \sqrt{\frac{p(1-p)}{n}}\)


# One-sample test of a proportion: 93 successes in 1035 trials against a null
# proportion of 0.13, two-sided, without the continuity correction.
ztest <- prop.test(
  x = 93,
  n = 1035,
  p = 0.13,
  correct = FALSE,
  alternative = "two.sided"
)
ztest
## 
##  1-sample proportions test without continuity correction
## 
## data:  93 out of 1035, null probability 0.13
## X-squared = 14.748, df = 1, p-value = 0.0001229
## alternative hypothesis: true p is not equal to 0.13
## 95 percent confidence interval:
##  0.07391569 0.10882775
## sample estimates:
##          p 
## 0.08985507

Exploration #3 - Color Distribution of One Type

Type = Peanut Butter

# Keep only the Peanut Butter rows and show the resulting tibble.
PB <- MMs %>% filter(Type == "Peanut_Butter")
PB
## # A tibble: 107 x 3
##    Type          Color Weight
##    <chr>         <chr>  <dbl>
##  1 Peanut_Butter Blue    1.84
##  2 Peanut_Butter Blue    1.82
##  3 Peanut_Butter Blue    1.82
##  4 Peanut_Butter Blue    1.75
##  5 Peanut_Butter Blue    1.73
##  6 Peanut_Butter Blue    1.71
##  7 Peanut_Butter Blue    1.69
##  8 Peanut_Butter Blue    1.67
##  9 Peanut_Butter Blue    1.63
## 10 Peanut_Butter Blue    1.61
## # … with 97 more rows
# Chi-squared goodness-of-fit test of the Peanut Butter colour counts against
# the proportions listed for Peanut & Peanut Butter (Red 0.10, Orange 0.20,
# Yellow 0.20, Green 0.20, Blue 0.20, Brown 0.10).
#
# The proportions are named by colour and the counts are tabulated with the
# same level order, so every observed count is compared with its own colour's
# proportion. The earlier unnamed vector was typed in Red..Brown order but was
# matched against table()'s alphabetical Blue..Yellow order, which tested
# Blue/Yellow against 0.10 and Brown/Red against 0.20.
#
# NOTE(review): the '##' output lines below were recomputed by hand for the
# corrected ordering - re-knit to confirm them.
exp_by_color <- c(
  Blue = 0.20, Brown = 0.10, Green = 0.20,
  Orange = 0.20, Red = 0.10, Yellow = 0.20
)
# Fail loudly on an unexpected colour label instead of dropping it as NA.
stopifnot(all(PB$Color %in% names(exp_by_color)))
exp <- unname(exp_by_color)
obsc <- table(factor(PB$Color, levels = names(exp_by_color)))
obsc
## 
##   Blue  Brown  Green Orange    Red Yellow 
##     15     20     19     16     21     16
# Expected counts under the listed proportions.
expc <- exp * sum(obsc)
expc
## [1] 21.4 10.7 21.4 21.4 10.7 21.4
# Per-colour contributions to the chi-squared statistic.
ctb <- ((obsc - expc)^2) / expc
ctb
## 
##      Blue     Brown     Green    Orange       Red    Yellow 
## 1.9140187 8.0831776 0.2691589 1.3626168 9.9149533 1.3626168
chisq.test(obsc, p = exp)
## 
##  Chi-squared test for given probabilities
## 
## data:  obsc
## X-squared = 22.907, df = 5, p-value = 0.0003517

Milk Chocolate:
Red = 0.13, Orange = 0.20, Yellow = 0.13, Green = 0.20, Blue = 0.21, Brown = 0.13
Peanut & Peanut Butter:
Red = 0.10, Orange = 0.20, Yellow = 0.20, Green = 0.20, Blue = 0.20, Brown = 0.10
Pretzel:
Red = 0.14, Orange = 0.14, Yellow = 0.14, Green = 0.14, Blue = 0.30, Brown = 0.14
Crispy:
Red = 0.14, Orange = 0.30, Yellow = 0.14, Green = 0.14, Blue = 0.14, Brown = 0.14


The distribution of the test statistic for one categorical variable (goodness of fit) is:
Shape: \(\chi^2\), where \(\chi^2 = \sum\frac{(Observed - Expected)^2}{Expected}\)
Degrees of Freedom: Number of Categories - 1


# Chi-squared density curve for df1 degrees of freedom, drawn over the x range
# from -1 to three times df1.
df1 <- 5
csqf <- ggplot(data.frame(x = c(-1, 3 * df1)), aes(x = x)) +
  stat_function(fun = dchisq, args = list(df = df1))
csqf1 <- csqf +
  scale_x_continuous(name = "Chi-Squared") +
  scale_y_continuous(name = "Probability") +
  ggtitle("Chi-Squared function curves of probabilities")
csqf1

Exploration #4 - Homogeneity of Color Distribution by Type

# Two-way table of counts: candy colour in the rows, candy type in the columns.
tbl <- table(MMs[["Color"]], MMs[["Type"]])
tbl
##         
##          Crispy Milk_Chocolate Peanut Peanut_Butter Pretzel
##   Blue       13            113     43            15      12
##   Brown      18             53     20            20      10
##   Green      18            106     23            19       5
##   Orange     10            112     57            16      19
##   Red        13             93     28            21       8
##   Yellow     10             82     44            16      18

The distribution of the test statistic for the relationship between two categorical variables is:
Shape: \(\chi^2\), where \(\chi^2 = \sum\frac{(Observed - Expected)^2}{Expected}\)
Degrees of Freedom: (Number of Variable 1 Categories - 1)(Number of Variable 2 Categories - 1)


# Three chi-squared density curves on one plot, with df1, df1 / 2 and df1 / 4
# degrees of freedom, drawn over the x range from -1 to 2.5 times df1.
df1 <- 20
csqf <- ggplot(data.frame(x = c(-1, 2.5 * df1)), aes(x = x)) +
  # One stat_function layer per divisor, added in the order df1, df1/2, df1/4.
  lapply(
    c(1, 2, 4),
    function(k) stat_function(fun = dchisq, args = list(df = df1 / k))
  )
csqf1 <- csqf +
  scale_x_continuous(name = "Chi-Squared") +
  scale_y_continuous(name = "Probability") +
  ggtitle("Chi-Squared function curves of probabilities")
csqf1