rm(list = ls()) # Clear environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 531902 28.5 1185087 63.3 NA 669265 35.8
## Vcells 983123 7.6 8388608 64.0 16384 1840436 14.1
cat("\f") # Clear the console
dev.off # Clear the charts
## function (which = dev.cur())
## {
## if (which == 1)
## stop("cannot shut down device 1 (the null device)")
## .External(C_devoff, as.integer(which))
## dev.cur()
## }
## <bytecode: 0x111bebe50>
## <environment: namespace:grDevices>
It would be very helpful if you could plot the distributions before calculating the probabilities. Begin with reading up on the plot() function.
These questions will help you build an understanding of Normal, Binomial, Hypergeometric and Poisson distribution.
You will be using the probability density function, cumulative density function and quantile function in this assignment.
i. Identify the distribution
This is a binomial distribution problem. The binomial distribution is used to describe the number of successes in a fixed number of trials. This is different from the geometric distribution, which described the number of trials we must wait before we observe a success.
Check its assumptions quickly.
ii. Identify the parameters
Given \(\pi = 50\% = .5\) is the true preference. Also, given \(n = 20\), the number of trails.
iii. See what is the Probability Statement Required to be Solved so that you know what to compute
Let X be the count of buyers who prefer red color.
In Words: Probability that between 9 and 12 (both inclusive) buyers would prefer red.
In Math : \(P( 9 \leq X \leq 12 | n=20, \pi =.5 )\)
iv. Compute/Evaluate
Helpful to plot the distribution - see Graphing Binomial Distribution in R
?plot
## Help on topic 'plot' was found in the following packages:
##
## Package Library
## graphics /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
## base /Library/Frameworks/R.framework/Resources/library
##
##
## Using the first match ...
# Generate binomial distribution data
x_values <- 0:20
y_values <- dbinom(x = x_values,
size = 20,
prob = 0.5
)
# Bare bones chart
plot(x = x_values,
y = y_values
)
An embellished chart with the area that we want as shaded -
# Set colors based on condition
colors <- ifelse(test = x_values <= 12 & x_values >= 9,
yes = "red",
no = "lightblue"
)
# Plot the binomial distribution with conditional coloring
plot(x = x_values,
y = y_values,
type = 'h',
main = 'Binomial Distribution (n=20, p=0.5)',
ylab = 'Probability',
xlab = '# Successes',
col = colors, # Specify colors
lwd = 3
)
Three methods to compute -
We will have to add up the probability mass function for the \(9^{th},10^{th},11^{th}\) and the \(12^{th}\) buyer.
?dbinom
dbinom(x = 0:20, size = 20,prob = .5)
## [1] 9.536743e-07 1.907349e-05 1.811981e-04 1.087189e-03 4.620552e-03
## [6] 1.478577e-02 3.696442e-02 7.392883e-02 1.201344e-01 1.601791e-01
## [11] 1.761971e-01 1.601791e-01 1.201344e-01 7.392883e-02 3.696442e-02
## [16] 1.478577e-02 4.620552e-03 1.087189e-03 1.811981e-04 1.907349e-05
## [21] 9.536743e-07
?options # Allow the user to set and examine a variety of global options which affect the way in which R computes and displays its results.
options(scipen=999) #prevent R from displaying numbers in scientific notation
# define range of successes
success <- 9:12
# display probability of success for each number of trials
dbinom(x = success, # or you can do x = 9:12
size = 20,
prob = .5
)
## [1] 0.1601791 0.1761971 0.1601791 0.1201344
# add all the probabilities
sum(dbinom(x = 9:12, size = 20, prob = .5))
## [1] 0.6166897
round(x = sum(dbinom(x = 9:12,
size = 20,
prob = .5)
),
digits = 4
) # if you want to get more fancy with rounding
## [1] 0.6167
.6167
Another way to do get the same answer - instead of adding pmfs, subtract cdf of 8 consumers from cdf of 12 consumers.
Plot the charts -
# Set up a grid layout with 1 row and 2 columns
par(mfrow = c(1, 2))
# Generate binomial distribution data
x <- 0:20
y <- dbinom(x, size = 20, prob = 0.5)
# Plot the first chart with x < 8 as red bars
plot(x = x,
y = y,
type = 'h',
main = 'Binomial Distribution (n=20, p=0.5)',
ylab = 'Probability',
xlab = '# Successes',
col = ifelse(test = x < 8,
yes = "red",
no = "black"), # Color bars where x < 8 as red
lwd = 3
)
# Plot the second chart with x < 12 as light blue bars
plot(x = x,
y = y,
type = 'h',
main = 'Binomial Distribution (n=20, p=0.5)',
ylab = 'Probability',
xlab = '# Successes',
col = ifelse(test = x < 12,
yes = "lightblue",
no = "black"), # Color bars where x < 12 as light blue
lwd = 3
)
Compute the probability -
?pbinom
pbinom(q = 12,
size = 20,
prob = .5,
lower.tail = TRUE # lower.tail logical; if TRUE (default), probabilities are P[X≤x] = 12
) -
pbinom(q = 8,
size = 20,
prob = .5,
lower.tail = TRUE # lower.tail logical; if TRUE (default), probabilities are P[X≤x] = 12
)
## [1] 0.6166897
Note due to discrete nature of the distribution, we have to adjust for \(q=9\) to \(q=9-1=8\).
# lower.tail = FALSE logical; P[X>x].
pbinom(q = 8,
size = 20,
prob = .5,
lower.tail = FALSE) -
pbinom(q = 12,
size = 20,
prob = .5,
lower.tail = FALSE
)
## [1] 0.6166897
# Set up a grid layout with 1 row and 2 columns
par(mfrow = c(1, 2))
# Plot the first chart with x > 8 as red bars
plot(x = x,
y = y,
type = 'h',
main = 'Binomial Distribution (n=20, p=0.5)',
ylab = 'Probability',
xlab = '# Successes',
col = ifelse(x > 8, "red", "black"), # Color bars where x > 8 as red
lwd = 3
)
# Plot the second chart with x > 12 as light blue bars
plot(x = x,
y = y,
type = 'h',
main = 'Binomial Distribution (n=20, p=0.5)',
ylab = 'Probability',
xlab = '# Successes',
col = ifelse(x > 12, "lightblue", "black"), # Color bars where x > 12 as light blue
lwd = 3
)
Round your answer to four decimal places.
i. Identify the distribution
Again, this is a binomial distribution problem. The binomial distribution is used to describe the number of successes in a fixed number of trials, and here the success is defined as bulb being defective. If that helps, you can think of the bulb being “special” and not necessarily bad/“failure” from a manufacturing perspective.
ii. Identify the parameters
\(n=13\)
\(\pi = 20\% =.2\)
iii. See what is the Probability Statement Required to be Solved so that you know what to compute
Let X be the count of defective bulbs.
In Words : Probability that less than 6 but more than 3 bulbs from the sample are defective.
In Math: \(P(3<X<6 | n=13, \pi =.2 )\)
iv. Compute/Evaluate
?plot
## Help on topic 'plot' was found in the following packages:
##
## Package Library
## graphics /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
## base /Library/Frameworks/R.framework/Resources/library
##
##
## Using the first match ...
plot(x = 0:13, # x variable
y = dbinom(x = 0:13, # y variable
size = 13,
prob = .2
),
main = 'Binomial Distribution (n=13, p=0.2)', # title
ylab = 'Probability', # y axis label
xlab = '# Successes', # x axis label
type = 'h', # "h" for histogram-like vertical lines. Can try "l" if continuous distribution, or ignore
lwd = 6, # line widths, can be ignored
col = ifelse(test = x >= 4 & x < 6,
yes = "red",
no = "lightblue"
)
)
dbinom(x = 4:5, size = 13, prob = .2) # list probabilities
## [1] 0.15354508 0.06909529
sum(dbinom(x = 4:5, size = 13, prob = .2)) # sum probabilities
## [1] 0.2226404
round(x = sum(dbinom(x = 4:5,
size = 13,
prob = .2
)
),
digits = 4
)
## [1] 0.2226
0.2226
# LOWER TAIL (default)
pbinom(q = 5,
size = 13,
prob = .2) -
pbinom(q = 3,
size = 13,
prob = .2)
## [1] 0.2226404
# UPPER TAIL (default)
pbinom(q = 3,
size = 13,
prob = .2,
lower.tail = FALSE
) -
pbinom(q = 5,
size = 13,
prob = .2,
lower.tail = FALSE)
## [1] 0.2226404
i. Identify the distribution
This is a Poisson probability distribution problem. Poisson distribution describes the number of times some event occurs during a specified interval. The interval may be time, distance, area, or volume. The Poisson is actually an approximation of the binomial distribution for very rare events.
The Poisson distribution is often useful for estimating the number of events in a large population over a unit of time. For instance, consider each of the following events: having a heart attack, getting married, and getting struck by lightning.
ii. Identify the parameters
\(\lambda=4.2\)
iii. See what is the Probability Statement Required to be Solved so that you know what to compute
Let X be the count of special orders sent out.
In Words : Probability that, for any day, the number of special orders sent out will be no more than 3?
In Math : \(P (X \leq 3 |\lambda=4.2)\)
iv. Compute/Evaluate
Helpful to plot the distribution - see Graphing Poisson Distribution in R
range <- 0:10 # choose a large number arbitrarily for range of successes
plot(x = range,
y = dpois(x = range,
lambda = 4.2
),
main = 'Poisson Distribution (lambda = 4.2)',
ylab = 'Probability',
xlab = '# Successes',
type = 'h',
lwd = 3,
col = ifelse(test = x <= 3,
yes = "red",
no = "lightblue"
)
)
dpois(x = range, lambda=4.2)
## [1] 0.014995577 0.062981423 0.132260988 0.185165383 0.194423652 0.163315867
## [7] 0.114321107 0.068592664 0.036011149 0.016805203 0.007058185
sum(dpois(x = 0:3,
lambda = 4.2)
) # Brute force - adding pdfs for every discrete event
## [1] 0.3954034
ppois(q = 3,
lambda = 4.2,
lower.tail = T
) # alternatively, just use the cdf (area upto the value)
## [1] 0.3954034
1 - ppois(q = 3,
lambda = 4.2,
lower.tail = F
) # also works for complements
## [1] 0.3954034
0.3954
i. Identify the distribution
This is a Hyper geometric distribution problem. Its properties include -
ii. Identify the parameters.
Let X be the number of contaminated bottles in sample. Thus, note contamination is ‘success’ here.. Recall - + N is population size: 17 # used in calculation of S and F, thus included in parameterization… + n is sample size: 3 + S is number of successes in population: 6 + Thus, F is the number of failure in population, N-S = 17-6 = 11 + x is number of successes in sample: less than 2
iii. See what is the Probability Statement Required to be Solved so that you know what to compute
Probability that less than 2 of the tested bottles are contaminated.
\(P(X<2 \ | \ S \ (Success \ in \ Population) = 6, F \ (Failure \ in \ Population) = N - S = 17 - 6 = 11, n \ (sample \ size)= 3 \ )\)
Note we have used the the info that \(N = 17\).
iv. Compute/Evaluate
Helpful to plot the distribution - see Graphing Hyper Geometric Distribution in R
# Specify x-values for dhyper function
?seq()
x_dhyper <- 0:17
# Apply dhyper function
?dhyper
y_dhyper <- dhyper(x = x_dhyper, # success in sample / event of interest // the number of white balls drawn without replacement from an urn which contains both black and white balls.
m = 6, # Success in Population // the number of white balls in the urn.
n = 17 - 6, # Failure in Population // the number of black balls in the urn.
k = 3 # sample size // the number of balls drawn from the urn, hence must be in 0,1,...m+n
)
# Plot dhyper values
plot(x = x_dhyper,
y = y_dhyper,
type = 'h',
main = 'Hypergeometric Distribution (S = 6, F = 11, n = 3)',
ylab = 'Probability',
xlab = '# Successes',
lwd = 3,
col = ifelse(test = x <= 1, # less than 2 is 1 or less
yes = "red",
no = "lightblue"
)
)
Note PMF falls to 0 after \(X>3\), as expected.
The prob of \(Prob(X=0)=0.24264,
Prob(X=1)=0.485, Prob(X=2)=0.24264, Prob(X=3)=0.02941176\).
dhyper(x = x_dhyper, # success in sample / event of interest
m = 6, # Success in Population
n = 17-6, # Failure in Population
k = 3 # sample size
)
## [1] 0.24264706 0.48529412 0.24264706 0.02941176 0.00000000 0.00000000
## [7] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## [13] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
?dhyper
# Brute Force: P(x = event of interest | m = Success in pop , n = Failure in pop , k = sample size)
sum(dhyper(x = 0:1, # success in sample / event of interest
m = 6, # Success in Population
n = 17-6, # Failure in Population
k = 3 # sample size
)
)
## [1] 0.7279412
# standard cdf
phyper(q = 1,
m = 6,
n = 17-6,
k = 3,
lower.tail = T)
## [1] 0.7279412
# cdf (upper tail)/complement of cdf
1 - phyper(q = 1,
m=6,
n=17-6,
k=3,
lower.tail = F)
## [1] 0.7279412
0.7279
i. Identify the distribution
This is a Hyper geometric distribution problem.
ii. Identify the parameters
Let X be count of employee was over 50. Note employee over 50 is ‘success’ here..
Recall - + N is population size: 25 # used in calculation of S and F, thus included in parametrization… \(19 + 6 =25\) + n is sample size: 6 + S is number of successes in population: 6 + Thus, F is the number of failure in population, N-S = 25-6 = 19 + x is number of successes in sample: 1
iii. See what is the Probability Statement Required to be Solved so that you know what to compute
Let X be the count of employees over 50.
\(P(X>1 | S = 6, F = 19, n = 6)\)
Note \(N = 25\) already used in the probability statement above (in the F argument).
iv. Compute/Evaluate
# Specify x-values for dhyper function
?seq()
x_dhyper <- seq(from = 0,
to = 25,
by = 1
)
# Apply dhyper function
?dhyper
y_dhyper <- dhyper(x = x_dhyper, # success in sample / event of interest
m = 6, # Success in Population
n = 19, # Failure in Population
k = 6. # sample size
)
# Plot dhyper values
plot(x = x_dhyper,
y = y_dhyper,
type = 'h',
main = 'Hypergeometric Distribution (S = 6, F = 19, n = 6)',
ylab = 'Probability',
xlab = '# Successes',
lwd = 3,
col = ifelse(test = x > 1,
yes = "red",
no = "lightblue"
)
)
Again, the probability falls down to zero when \(x>n\) in dhyper arguments.
dhyper(x = 0:25,m = 6,n = 19,k = 6)
## [1] 0.153201581028 0.393946922643 0.328289102202 0.109429700734 0.014483342744
## [6] 0.000643704122 0.000005646527 0.000000000000 0.000000000000 0.000000000000
## [11] 0.000000000000 0.000000000000 0.000000000000 0.000000000000 0.000000000000
## [16] 0.000000000000 0.000000000000 0.000000000000 0.000000000000 0.000000000000
## [21] 0.000000000000 0.000000000000 0.000000000000 0.000000000000 0.000000000000
## [26] 0.000000000000
# Brute Force: P(x = 2,3,4,5,6 | m = Success in pop , n = Failure in pop , k = sample size)
sum(dhyper(x = 2:6, # success in sample / event of interest
m = 6, # Success in Population
n = 19, # Failure in Population
k = 6 # Sample Size
)
)
## [1] 0.4528515
# standard cdf / lower.tail = T, but Question asks for 2 or more people in the sample...thus subtract 1
1 - phyper(q = 1,
m = 6,
n = 19,
k = 6,
lower.tail = T # P[X≤x]
)
## [1] 0.4528515
# complement of standard cdf / lower.tail = F
phyper(q = 1,
m = 6,
n = 19,
k = 6,
lower.tail = F # P[X>x]
)
## [1] 0.4528515
0.4529.
i. Identify the distribution
This is a Normal distribution problem, as mentioned in the question.
ii. Identify the parameters
Given \(\sigma^2 = 90,000\) and \(\mu=800\).
This implies \(\sigma=sqrt(90000)=300\)
iii. See what is the Probability Statement Required to be Solved so that you know what to compute
Probability that the weight of a randomly selected steer is between 1040 and 1460 lbs. \(P(1040<X<1460) | \mu=800, \sigma = 300)\)
iv. Compute/Evaluate
First, lets plot the normal distribution.
Plotting normal distribution in R
#define population mean and standard deviation
population_mean <- 800
population_sd <- 300
#### FOR GRAPHING, else draw a rough figure
#define upper and lower bound
lower_bound <- population_mean - population_sd
upper_bound <- population_mean + population_sd
#Create a sequence of 1000 x values based on population mean and standard deviation
?seq()
x <- seq(from = -4,
to = 4,
length = 1000) * population_sd + population_mean
#create a vector of values that shows the height of the probability distribution for each value in x
y <- dnorm(x = x, # argument = vector x created by seq() function
mean = population_mean,
sd = population_sd
)
#plot normal distribution with customized x-axis labels
plot(x = x,
y = y,
type = "l",
lwd = 2,
axes = FALSE,
xlab = "",
ylab = ""
)
sd_axis_bounds <- 5
axis_bounds <- seq(from = -sd_axis_bounds * population_sd + population_mean,
to = sd_axis_bounds * population_sd + population_mean,
by = population_sd)
axis(side = 1, at = axis_bounds, pos = 0)
# Shade the area between 1040 and 1060
?seq
x_shade <- seq(from = 1040, to = 1460, by=(length.out=10) )
head(x_shade)
## [1] 1040 1050 1060 1070 1080 1090
y_shade <- dnorm(x = x_shade, mean = population_mean, sd = population_sd )
head(y_shade)
## [1] 0.0009656385 0.0009397063 0.0009134549 0.0008869508 0.0008602594
## [6] 0.0008334447
polygon(x = c(1040, x_shade,1460),
y = c(0,y_shade,0),
col = "gray",
border = NA)
sum(dnorm(x= 1040:1460, # THIS IS AN APPROXIMATION AS WE WANT THE x values to be between 1040 and 1460
mean = 800,
sd = 300)
) # Brute Force not the best method here as it is a continuous distribution...it is an approximation, and should be avoided.
## [1] 0.198494
# MOST DIRECT AND BEST METHOD
pnorm(q = 1460,
mean = 800,
sd = 300) - pnorm(q = 1040,
mean = 800,
sd = 300)
## [1] 0.197952
# Better than dnorm as normal is continous distribution, not discrete.
pnorm(q = 1040,
mean = 800,
sd = 300,
lower.tail = F) - pnorm(q = 1460,
mean = 800,
sd = 300,
lower.tail = F)
## [1] 0.197952
round( x = (pnorm(q = 1460, mean = 800,sd = sqrt(90000))-pnorm(q = 1040,mean = 800,sd = sqrt(90000)) ),digits = 4)
## [1] 0.198
0.198
i. Identify the distribution
This is a Normal distribution problem, as mentioned in the question.
ii. Identify the parameters
Given \(\sigma = 4\) and \(\mu=106\).
iii. See what is the Probability Statement Required to be Solved so that you know what to compute
Probability that the diameter of a selected bearing is between 103 and 111 millimeters. \(P(103<X<111 | \mu=106, \sigma = 4 )\)
iv. Compute/Evaluate
#define population mean and standard deviation
population_mean <- 106
population_sd <- 4
#### FOR GRAPHING
#define upper and lower bound
lower_bound <- population_mean - population_sd
upper_bound <- population_mean + population_sd
#Create a sequence of 1000 x values based on population mean and standard deviation
x <- seq(from = -4,
to = 4,
length = 1000) * population_sd + population_mean
#create a vector of values that shows the height of the probability distribution
#for each value in x
y <- dnorm(x = x,
mean = population_mean,
sd = population_sd)
#plot normal distribution with customized x-axis labels
plot(x = x,
y = y,
type = "l",
lwd = 2,
axes = FALSE,
xlab = "",
ylab = ""
)
sd_axis_bounds <- 5
axis_bounds <- seq(from = -sd_axis_bounds * population_sd + population_mean,
to = sd_axis_bounds * population_sd + population_mean,
by = population_sd)
axis(side = 1, at = axis_bounds, pos = 0)
# Shade the area between 103 and 111
?seq
x_shade <- seq(103, 111, length.out=100 )
head(x_shade)
## [1] 103.0000 103.0808 103.1616 103.2424 103.3232 103.4040
y_shade <- dnorm(x = x_shade, mean = population_mean, sd = population_sd )
head(y_shade)
## [1] 0.07528436 0.07641812 0.07753730 0.07864078 0.07972741 0.08079608
polygon(x = c(103, x_shade,111),
y = c(0,y_shade,0),
col = "gray",
border = NA)
x <- 103:111
x
## [1] 103 104 105 106 107 108 109 110 111
dnorm(x = 103:111,
mean = 106,
sd = 4
)
## [1] 0.07528436 0.08801633 0.09666703 0.09973557 0.09666703 0.08801633 0.07528436
## [8] 0.06049268 0.04566227
?dnorm
sum(dnorm(x = 103:111,
mean = 106,
sd = 4)
) # Brute Force not the best method here, it is a continuous distribution...should be avoided...
## [1] 0.725826
# if you really want to use discrete method, you will have to evaluate each observation at its midpoint - 103.5 will represent 103 to 104, 104.5 will represent 104 to 105,...
sum(dnorm(x = 103.5:110.5, # 8 points evaluated at the midpoints
mean = 106,
sd = 4)
)
## [1] 0.6689098
# MOST DIRECT AND BEST METHOD
pnorm(q = 111,
mean = 106,
sd = 4) - pnorm(q=103,
mean = 106,
sd = 4)
## [1] 0.6677229
#### shorter way to do the same thing...
?diff # Returns suitably lagged and iterated differences.
diff(x = pnorm(c(103,111), mean = 106, sd = 4) # a numeric vector or matrix containing the values to be differenced.
)
## [1] 0.6677229
pnorm(q = 103,
mean = 106,
sd = 4,
lower.tail = F) - pnorm(q = 111,
mean = 106,
sd = 4,
lower.tail = F)
## [1] 0.6677229
round( x = ( pnorm(q = 111,mean = 106,sd = 4) - pnorm(q = 103,mean = 106,sd = 4) ),
digits = 4)
## [1] 0.6677
0.6677
You will have to use the quantile function, qnorm() here.
i. Identify the distribution
This is a Normal distribution problem, as mentioned in the question.
ii. Identify the parameters
\(\mu=3.34, \sigma=.07\)
iii. find the two lengths that separate the top 3% and the bottom 3%.
Use the ?qnorm function.
iv. Compute/Evaluate
What is a quantile? Quantile is a statistical term used to describe the division of a set of numerical values into subgroups, called quantiles, so that each subgroup contains approximately the same number of observations. The most common quantile is the median, which splits a dataset into two equal halves. Other common quantiles include quartiles (splitting the data into 4 equal parts), deciles (10 equal parts), and percentiles (100 equal parts). Quantiles can provide valuable information about the distribution of data, such as skewness, kurtosis, and the range of values.
What is quantile function?
A quantile function, also known as a quantile map or an inverse cumulative distribution function, is a mathematical function that maps a probability value to the corresponding quantile value in a distribution. In other words, given a probability value, the quantile function returns the value at which that probability of the data is exceeded. For example, the 0.75 quantile, also known as the 75th percentile, is the value below which 75% of the data falls.
Quantile functions are commonly used in statistics and data analysis to summarize the distribution of a dataset, and to compare the distributions of different datasets. For a continuous distribution, the quantile function is unique and can be represented analytically. For discrete distributions, quantile functions may not exist, or may not be unique, and are often estimated from the data.
?qnorm
qnorm(p = .03, mean = 3.34, sd = .07)
## [1] 3.208344
qnorm(p = .97, mean = 3.34, sd = .07)
## [1] 3.471656
round(x = c( qnorm(p = .03, mean = 3.34, sd = .07),
qnorm(p = .97, mean = 3.34, sd = .07)
),
digits = 2)
## [1] 3.21 3.47
3.21 and 3.47 separate the bottom 3% and the top 3%.
Furthermore, We know normal distribution is symmetric. In fact, the 50th percentile / middle 50% data value in ascending order ranked right in the middle of 3 and 97 percentile values.
qnorm(p = c (.03,.5,.97), # Let p be the vector of probabilities. Here, the vector represents 3, 50 and 97 percentile.
mean = 3.34,
sd = .07
)
## [1] 3.208344 3.340000 3.471656
#define population mean and standard deviation
population_mean <- 3.34
population_sd <- .07
# Store quantiles for 3rd, 50th, and 97th percentiles
quantiles <- qnorm(p = c(0.03, 0.5, 0.97), mean = population_mean, sd = population_sd)
#define upper and lower bound
lower_bound <- population_mean - population_sd
upper_bound <- population_mean + population_sd
#Create a sequence of 1000 x values based on population mean and standard deviation
x <- seq(from = -4, to = 4, length = 1000) * population_sd + population_mean
#create a vector of values that shows the height of the probability distribution
#for each value in x
y <- dnorm(x, population_mean, population_sd)
#plot normal distribution with customized x-axis labels
plot(x,y, type = "l", lwd = 2, axes = FALSE, xlab = "", ylab = "")
sd_axis_bounds = 5
axis_bounds <- seq(from = -sd_axis_bounds * population_sd + population_mean,
to = sd_axis_bounds * population_sd + population_mean,
by = population_sd)
axis(side = 1, at = axis_bounds, pos = 0)
# Shade areas corresponding to the 3rd, 50th, and 97th percentiles
polygon(x = c(x[x <= quantiles[1]], quantiles[1]),
y = c(y[x <= quantiles[1]], 0),
col = "red",
border = NA)
polygon(x = c(quantiles[3], x[x >= quantiles[3]]),
y = c(0, y[x >= quantiles[3]]),
col = "green",
border = NA)
# Find the length separating the top 3%
top_3_percent <- qnorm(0.97, mean = population_mean, sd = population_sd )
# Find the length separating the bottom 3%
bottom_3_percent <- qnorm(0.03, mean = population_mean, sd = population_sd )
# Print the results
cat("The length separating the top 3% of nails:", round(top_3_percent, 2), "cm\n")
## The length separating the top 3% of nails: 3.47 cm
cat("The length separating the bottom 3% of nails:", round(bottom_3_percent, 2), "cm\n")
## The length separating the bottom 3% of nails: 3.21 cm
Scores on the test are normally distributed with a mean of 75.8 and a standard deviation of 8.1. Find the minimum score required for an A grade. Round your answer to the nearest whole number, if necessary.
i. Identify the distribution
This is a Normal distribution problem, as mentioned in the question.
ii. Identify the parameters
\(\mu=75.8, \sigma=8.1\)
iii. Minimum score required for an A grade
\(91^{st}\) percentile.
iv. Compute/Evaluate
ANS <- qnorm(p = .91, # p is vector of probabilities.
mean = 75.8,
sd = 8.1
)
round(x = ANS,
digits = 0
)
## [1] 87
87
Visually -
# Define population mean and standard deviation
population_mean <- 75.8
population_sd <- 8.1
# Find the minimum score required for an A grade (91st percentile)
min_A_score <- qnorm(p = 0.91, mean = population_mean, sd = population_sd)
# Create a sequence of x values
x <- seq(from = population_mean - 3 * population_sd,
to = population_mean + 3 * population_sd,
length.out = 1000
)
# Compute the corresponding y values (probability density function)
y <- dnorm(x, mean = population_mean, sd = population_sd)
# Plot the normal distribution
plot(x, y, type = "l", lwd = 2, main = "Normal Distribution of Test Scores", xlab = "Test Scores", ylab = "Probability Density")
# Shade the area corresponding to the top 9% of scores (A grade)
polygon(x = c(min_A_score, x[x >= min_A_score]),
y = c(0, y[x >= min_A_score]),
col = "lightblue"
)
# Add text to label the shaded area
text(x = min_A_score + 2, y = max(y) * 0.2, labels = "A Grade", col = "blue")
# Add a vertical line to mark the minimum score for an A grade
abline(v = min_A_score, col = "blue", lty = 2)
i. Identify the distribution
ii. Identify the parameters
Let random variable \(X\) measure the count of no crashes. Given \(\pi = 61\% = .61\) is the success, as usual (here - no computer crash). Also, given \(n = 155\), the number of trails.
n <- 155
pi <- .61
MEAN <- n * pi # approximately
SD <- sqrt(pi * (1-pi) * n ) # approximately
iii. See what is the Probability Statement Required to be Solved so that you know what to compute
\(X\) measures no crash. \(P (X = 96 | \pi = .61, n=155 )\)
iv. Compute/Evaluate
# USE DNORM IF normal
# if normal, mean = n * p ; sd = sqrt (var) = sqrt (p * (1-p) * n)
dnorm(x = 96, # 95.9:96.1
mean = .61 * 155,
sd = sqrt(.61 * .39 * 155 )
)
## [1] 0.06385071
# Alternatively approximate by centering around X=96 (add and subtract .5 from the point estimate - not the recommended approach)
pnorm(q = 96.5, mean = .61*155, sd = sqrt(.61 * .39 * 155) ) -
pnorm(q = 95.5, mean = .61*155, sd = sqrt(.61 * .39 * 155) )
## [1] 0.06378274
# USE DIBINOM IF binomial
dbinom( x = 96,
size = 155,
prob = .61
)
## [1] 0.06402352
Normal ( 0.06378274 ) approximates the binomial probability well in this case ( 0.06402352 ) as n is large - both np and n(1-p) are larger than 10.
Show n
is large and \(\pi\) is small -
n <- 155
p <- .61
n * p > 10
## [1] TRUE
n * (1-p) > 10
## [1] TRUE
Visually, they are the same when large sample and small proportion -
# Define parameters
n <- 155
pi <- 0.61
# Compute mean and standard deviation for the normal approximation
mean_normal <- n * pi
sd_normal <- sqrt(pi * (1 - pi) * n)
# Create a sequence of x values
x <- seq(from = 0, to = n, by = 1)
# Compute the corresponding probabilities for the binomial distribution
prob_binomial <- dbinom(x, size = n, prob = pi)
# Compute the corresponding probabilities for the normal distribution approximation
prob_normal <- dnorm(x, mean = mean_normal, sd = sd_normal)
# Set up the grid layout
par(mfrow = c(2, 1))
# Plot binomial distribution
plot(x, prob_binomial, type = "h", lwd = 2, col = "blue",
main = "Binomial Distribution",
xlab = "Number of Computers Not Crashing",
ylab = "Probability")
# Plot normal approximation
plot(x, prob_normal, type = "h", lwd = 2, col = "red",
main = "Normal Approximation",
xlab = "Number of Computers Not Crashing",
ylab = "Probability")