rm(list = ls())   # Clear environment 
gc()              # Clear unused memory
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 531902 28.5    1185087 63.3         NA   669265 35.8
## Vcells 983123  7.6    8388608 64.0      16384  1840436 14.1
cat("\f")         # Clear the console
dev.off           # Clear the charts
## function (which = dev.cur()) 
## {
##     if (which == 1) 
##         stop("cannot shut down device 1 (the null device)")
##     .External(C_devoff, as.integer(which))
##     dev.cur()
## }
## <bytecode: 0x111bebe50>
## <environment: namespace:grDevices>

It would be very helpful if you could plot the distributions before calculating the probabilities. Begin with reading up on the plot() function.

These questions will help you build an understanding of Normal, Binomial, Hypergeometric and Poisson distribution.

You will be using the probability density function, cumulative density function and quantile function in this assignment.

1. A researcher wishes to conduct a study of the color preferences of new car buyers. Suppose that 50% of this population prefers the color red. If 20 buyers are randomly selected, what is the probability that between 9 and 12 (both inclusive) buyers would prefer red? Round your answer to four decimal places.

i. Identify the distribution

  • This is a binomial distribution problem. The binomial distribution is used to describe the number of successes in a fixed number of trials. This is different from the geometric distribution, which described the number of trials we must wait before we observe a success.

  • Check its assumptions quickly.

    • There are only two possible outcomes on a particular trial of an experiment (person likes or does not like the color red).
    • The outcomes are mutually exclusive.
    • The random variable is the result of counts.
    • Each trial is independent of any other trial, although independence is not too much of a concern if the sample is less than 1/20th of the population (simulation studies).

ii. Identify the parameters

Given \(\pi = 50\% = .5\) is the true preference. Also, given \(n = 20\), the number of trails.

iii. See what is the Probability Statement Required to be Solved so that you know what to compute

Let X be the count of buyers who prefer red color.

In Words: Probability that between 9 and 12 (both inclusive) buyers would prefer red.

In Math : \(P( 9 \leq X \leq 12 | n=20, \pi =.5 )\)

iv. Compute/Evaluate

Helpful to plot the distribution - see Graphing Binomial Distribution in R

?plot
## Help on topic 'plot' was found in the following packages:
## 
##   Package               Library
##   graphics              /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
##   base                  /Library/Frameworks/R.framework/Resources/library
## 
## 
## Using the first match ...
# Generate binomial distribution data
x_values  <- 0:20
y_values <- dbinom(x = x_values, 
                   size = 20,
                   prob = 0.5
                   )

# Bare bones chart 
plot(x = x_values, 
     y = y_values
)

An embellished chart with the area that we want as shaded -

# Set colors based on condition
colors <- ifelse(test = x_values <= 12 & x_values >= 9,
                 yes = "red",
                 no =  "lightblue"
                 )

# Plot the binomial distribution with conditional coloring
plot(x = x_values, 
     y = y_values, 
     type = 'h',
     main = 'Binomial Distribution (n=20, p=0.5)',
     ylab = 'Probability',
     xlab = '# Successes',
     col = colors,  # Specify colors
     lwd = 3
)

Three methods to compute -

I. Brute Force Approach - Add PMFs

We will have to add up the probability mass function for the \(9^{th},10^{th},11^{th}\) and the \(12^{th}\) buyer.

?dbinom
dbinom(x = 0:20, size = 20,prob = .5)
##  [1] 9.536743e-07 1.907349e-05 1.811981e-04 1.087189e-03 4.620552e-03
##  [6] 1.478577e-02 3.696442e-02 7.392883e-02 1.201344e-01 1.601791e-01
## [11] 1.761971e-01 1.601791e-01 1.201344e-01 7.392883e-02 3.696442e-02
## [16] 1.478577e-02 4.620552e-03 1.087189e-03 1.811981e-04 1.907349e-05
## [21] 9.536743e-07
?options # Allow the user to set and examine a variety of global options which affect the way in which R computes and displays its results.
options(scipen=999) #prevent R from displaying numbers in scientific notation 

# define range of successes
success <- 9:12

# display probability of success for each number of trials
dbinom(x    = success, # or you can do x = 9:12
       size = 20, 
       prob = .5
      )
## [1] 0.1601791 0.1761971 0.1601791 0.1201344
# add all the probabilities
sum(dbinom(x = 9:12, size = 20, prob = .5))
## [1] 0.6166897
round(x      = sum(dbinom(x    = 9:12, 
                          size = 20, 
                          prob = .5)
                   ),
      digits = 4
      )  # if you want to get more fancy with rounding
## [1] 0.6167

.6167

II. CDF Force Approach - Substract (lower tail)

Another way to do get the same answer - instead of adding pmfs, subtract cdf of 8 consumers from cdf of 12 consumers.

Plot the charts -

# Set up a grid layout with 1 row and 2 columns
par(mfrow = c(1, 2))

# Generate binomial distribution data
x <- 0:20
y <- dbinom(x, size = 20, prob = 0.5)

# Plot the first chart with x < 8 as red bars
plot(x = x, 
     y = y, 
     type = 'h',
     main = 'Binomial Distribution (n=20, p=0.5)',
     ylab = 'Probability',
     xlab = '# Successes',
     col = ifelse(test = x < 8,
                  yes =  "red", 
                  no = "black"),  # Color bars where x < 8 as red
     lwd = 3
)

# Plot the second chart with x < 12 as light blue bars
plot(x = x, 
     y = y, 
     type = 'h',
     main = 'Binomial Distribution (n=20, p=0.5)',
     ylab = 'Probability',
     xlab = '# Successes',
     col = ifelse(test = x < 12, 
                  yes = "lightblue", 
                  no = "black"),  # Color bars where x < 12 as light blue
     lwd = 3
)

Compute the probability -

?pbinom 

pbinom(q    = 12,
       size = 20, 
       prob = .5,
       lower.tail = TRUE    # lower.tail    logical; if TRUE (default), probabilities are P[X≤x]  = 12
       )  - 
pbinom(q    = 8, 
       size = 20, 
       prob = .5,
       lower.tail = TRUE    # lower.tail    logical; if TRUE (default), probabilities are P[X≤x]  = 12
       ) 
## [1] 0.6166897

Note due to discrete nature of the distribution, we have to adjust for \(q=9\) to \(q=9-1=8\).

III. CDF Force Approach - Substract (upper tail)

# lower.tail = FALSE  logical;  P[X>x].
pbinom(q    = 8,
       size = 20, 
       prob = .5,
       lower.tail = FALSE)   - 
pbinom(q    = 12, 
       size = 20, 
       prob = .5,
       lower.tail = FALSE   
       ) 
## [1] 0.6166897
# Set up a grid layout with 1 row and 2 columns
par(mfrow = c(1, 2))

# Plot the first chart with x > 8 as red bars
plot(x = x, 
     y = y, 
     type = 'h',
     main = 'Binomial Distribution (n=20, p=0.5)',
     ylab = 'Probability',
     xlab = '# Successes',
     col = ifelse(x > 8, "red", "black"),  # Color bars where x > 8 as red
     lwd = 3
)

# Plot the second chart with x > 12 as light blue bars
plot(x = x, 
     y = y, 
     type = 'h',
     main = 'Binomial Distribution (n=20, p=0.5)',
     ylab = 'Probability',
     xlab = '# Successes',
     col = ifelse(x > 12, "lightblue", "black"),  # Color bars where x > 12 as light blue
     lwd = 3
)

2. A quality control inspector has drawn a sample of 13 light bulbs from a recent production lot. Suppose 20% of the bulbs in the lot are defective. What is the probability that less than 6 but more than 3 bulbs from the sample are defective?

Round your answer to four decimal places.

i. Identify the distribution

Again, this is a binomial distribution problem. The binomial distribution is used to describe the number of successes in a fixed number of trials, and here the success is defined as bulb being defective. If that helps, you can think of the bulb being “special” and not necessarily bad/“failure” from a manufacturing perspective.

  • Check its assumptions quickly.
    • There are only two possible outcomes on a particular trial of an experiment (bulb defective or not).
    • The outcomes are mutually exclusive.
    • The random variable is the result of counts.
    • Each trial is independent of any other trial…

ii. Identify the parameters

\(n=13\)

\(\pi = 20\% =.2\)

iii. See what is the Probability Statement Required to be Solved so that you know what to compute

Let X be the count of defective bulbs.

In Words : Probability that less than 6 but more than 3 bulbs from the sample are defective.

In Math: \(P(3<X<6 | n=13, \pi =.2 )\)

iv. Compute/Evaluate

I. Brute Force Approach - Add PMFs

?plot
## Help on topic 'plot' was found in the following packages:
## 
##   Package               Library
##   graphics              /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
##   base                  /Library/Frameworks/R.framework/Resources/library
## 
## 
## Using the first match ...
plot(x    = 0:13,                                 # x variable
     y   = dbinom(x     = 0:13,                   # y variable 
                   size = 13, 
                   prob = .2
                  ), 
     main = 'Binomial Distribution (n=13, p=0.2)', # title
     ylab = 'Probability',                         # y axis label
     xlab = '# Successes',                         # x axis label
     type = 'h',                                   # "h" for histogram-like vertical lines. Can try "l" if continuous distribution, or ignore
     lwd  = 6,                                    # line widths, can be ignored
     col  = ifelse(test =  x >= 4 & x < 6, 
                   yes = "red", 
                   no = "lightblue"
                   )
     )

dbinom(x = 4:5, size = 13, prob = .2)              # list probabilities
## [1] 0.15354508 0.06909529
sum(dbinom(x = 4:5, size = 13, prob = .2))         # sum probabilities
## [1] 0.2226404
round(x      = sum(dbinom(x    = 4:5, 
                          size = 13, 
                          prob = .2
                          )
                   ),
      digits = 4
      )
## [1] 0.2226

0.2226

II. CDF Force Approachs

# LOWER TAIL (default)

pbinom(q    = 5, 
       size = 13, 
       prob = .2) -
pbinom(q    = 3, 
       size = 13, 
       prob = .2)
## [1] 0.2226404
# UPPER TAIL (default)
pbinom(q    = 3, 
       size = 13, 
       prob = .2, 
       lower.tail = FALSE
       ) -
pbinom(q    = 5, 
       size = 13, 
       prob = .2,
       lower.tail = FALSE)
## [1] 0.2226404

3. The auto parts department of an automotive dealership sends out a mean of 4.2 special orders daily. What is the probability that, for any day, the number of special orders sent out will be no more than 3? Round your answer to four decimal places.

i. Identify the distribution

This is a Poisson probability distribution problem. Poisson distribution describes the number of times some event occurs during a specified interval. The interval may be time, distance, area, or volume. The Poisson is actually an approximation of the binomial distribution for very rare events.

The Poisson distribution is often useful for estimating the number of events in a large population over a unit of time. For instance, consider each of the following events: having a heart attack, getting married, and getting struck by lightning.

ii. Identify the parameters

\(\lambda=4.2\)

iii. See what is the Probability Statement Required to be Solved so that you know what to compute

Let X be the count of special orders sent out.

In Words : Probability that, for any day, the number of special orders sent out will be no more than 3?

In Math : \(P (X \leq 3 |\lambda=4.2)\)

iv. Compute/Evaluate

Helpful to plot the distribution - see Graphing Poisson Distribution in R

range <- 0:10     # choose a large number arbitrarily for range of successes

plot(x    = range, 
     y    = dpois(x      = range, 
                  lambda = 4.2
                  ),
     main = 'Poisson Distribution (lambda = 4.2)',
     ylab = 'Probability',
     xlab = '# Successes',
     type = 'h',
     lwd  = 3, 
     col  = ifelse(test = x <= 3, 
                   yes = "red", 
                   no = "lightblue"
                   )   
     )

dpois(x = range, lambda=4.2)
##  [1] 0.014995577 0.062981423 0.132260988 0.185165383 0.194423652 0.163315867
##  [7] 0.114321107 0.068592664 0.036011149 0.016805203 0.007058185

Three Different ways to get the sane answer.

sum(dpois(x      = 0:3, 
          lambda = 4.2)
    )               # Brute force - adding pdfs for every discrete event
## [1] 0.3954034
ppois(q          = 3,
      lambda     = 4.2, 
      lower.tail = T
      )      # alternatively, just use the cdf (area upto the value)
## [1] 0.3954034
1 - ppois(q          = 3,
          lambda     = 4.2, 
          lower.tail = F
          )  # also works for complements 
## [1] 0.3954034

0.3954

4. A pharmacist receives a shipment of 17 bottles of a drug and has 3 of the bottles tested. If 6 of the 17 bottles are contaminated, what is the probability that less than 2 of the tested bottles are contaminated? Express your answer as a fraction or a decimal number rounded to four decimal places.

i. Identify the distribution

This is a Hyper geometric distribution problem. Its properties include -

  • Sampling without replacement from a finite population
  • The number of objects in the population is denoted N.
  • Each trial has exactly two possible outcomes, success and failure.
  • Trials are not independent
  • X is the number of successes in the n trials
  • The binomial is an acceptable approximation if n < 5% N. Otherwise it is not.

ii. Identify the parameters.

Let X be the number of contaminated bottles in sample. Thus, note contamination is ‘success’ here.. Recall - + N is population size: 17 # used in calculation of S and F, thus included in parameterization… + n is sample size: 3 + S is number of successes in population: 6 + Thus, F is the number of failure in population, N-S = 17-6 = 11 + x is number of successes in sample: less than 2

iii. See what is the Probability Statement Required to be Solved so that you know what to compute

Probability that less than 2 of the tested bottles are contaminated.

\(P(X<2 \ | \ S \ (Success \ in \ Population) = 6, F \ (Failure \ in \ Population) = N - S = 17 - 6 = 11, n \ (sample \ size)= 3 \ )\)

Note we have used the the info that \(N = 17\).

iv. Compute/Evaluate

Helpful to plot the distribution - see Graphing Hyper Geometric Distribution in R

# Specify x-values for dhyper function
?seq()
x_dhyper <- 0:17
  
# Apply dhyper function
?dhyper
y_dhyper <- dhyper(x = x_dhyper,    # success in sample / event of interest // the number of white balls drawn without replacement from an urn which contains both black and white balls.
                   m = 6,           # Success in Population // the number of white balls in the urn.
                   n = 17 - 6,      # Failure in Population // the number of black balls in the urn.
                   k = 3            # sample size // the number of balls drawn from the urn, hence must be in 0,1,...m+n
                   )   
   
# Plot dhyper values
plot(x    = x_dhyper,
     y    = y_dhyper,
     type = 'h',
     main = 'Hypergeometric Distribution (S = 6, F = 11, n = 3)',
     ylab = 'Probability',
     xlab = '# Successes',
     lwd  = 3, 
     col  = ifelse(test =  x <= 1, # less than 2 is 1 or less 
                   yes = "red", 
                   no = "lightblue"
                   )
     ) 

Note PMF falls to 0 after \(X>3\), as expected.
The prob of \(Prob(X=0)=0.24264, Prob(X=1)=0.485, Prob(X=2)=0.24264, Prob(X=3)=0.02941176\).

dhyper(x = x_dhyper,       # success in sample / event of interest
           m = 6,          # Success in Population
           n = 17-6,       # Failure in Population
           k = 3           # sample size
           )
##  [1] 0.24264706 0.48529412 0.24264706 0.02941176 0.00000000 0.00000000
##  [7] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## [13] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000

Three Different ways to get the same answer.

?dhyper

# Brute Force:  P(x = event of interest | m = Success in pop , n = Failure in pop , k = sample size) 
sum(dhyper(x = 0:1,        # success in sample / event of interest
           m = 6,          # Success in Population
           n = 17-6,       # Failure in Population
           k = 3           # sample size
           )
    )       
## [1] 0.7279412
# standard cdf 
phyper(q = 1, 
       m = 6, 
       n = 17-6,
       k = 3, 
       lower.tail = T)      
## [1] 0.7279412
# cdf (upper tail)/complement of cdf
1 - phyper(q = 1, 
           m=6, 
           n=17-6,
           k=3, 
           lower.tail = F)    
## [1] 0.7279412

0.7279

5. A town recently dismissed 6 employees in order to meet their new budget reductions. The town had 6 employees over 50 years of age and 19 under 50. If the dismissed employees were selected at random, what is the probability that more than 1 employee was over 50? Round your answer to four decimal places.

i. Identify the distribution

This is a Hyper geometric distribution problem.

ii. Identify the parameters

Let X be count of employee was over 50. Note employee over 50 is ‘success’ here..

Recall - + N is population size: 25 # used in calculation of S and F, thus included in parametrization… \(19 + 6 =25\) + n is sample size: 6 + S is number of successes in population: 6 + Thus, F is the number of failure in population, N-S = 25-6 = 19 + x is number of successes in sample: 1

iii. See what is the Probability Statement Required to be Solved so that you know what to compute

Let X be the count of employees over 50.

\(P(X>1 | S = 6, F = 19, n = 6)\)

Note \(N = 25\) already used in the probability statement above (in the F argument).

iv. Compute/Evaluate

# Specify x-values for dhyper function
?seq()
x_dhyper <- seq(from = 0, 
                to   = 25, 
                by   = 1
                )
  
# Apply dhyper function
?dhyper
y_dhyper <- dhyper(x = x_dhyper,  # success in sample / event of interest
                   m = 6,         # Success in Population
                   n = 19,        # Failure in Population
                   k = 6.         # sample size
                   )   
   
# Plot dhyper values
plot(x    = x_dhyper,
     y    = y_dhyper,
     type = 'h',
     main = 'Hypergeometric Distribution (S = 6, F = 19, n = 6)',
     ylab = 'Probability',
     xlab = '# Successes',
     lwd  = 3, 
     col  = ifelse(test =  x > 1, 
                   yes = "red", 
                   no = "lightblue"
                   )
     ) 

Again, the probability falls down to zero when \(x>n\) in dhyper arguments.

dhyper(x = 0:25,m = 6,n = 19,k = 6)
##  [1] 0.153201581028 0.393946922643 0.328289102202 0.109429700734 0.014483342744
##  [6] 0.000643704122 0.000005646527 0.000000000000 0.000000000000 0.000000000000
## [11] 0.000000000000 0.000000000000 0.000000000000 0.000000000000 0.000000000000
## [16] 0.000000000000 0.000000000000 0.000000000000 0.000000000000 0.000000000000
## [21] 0.000000000000 0.000000000000 0.000000000000 0.000000000000 0.000000000000
## [26] 0.000000000000

Three Different ways to get the same answer.

# Brute Force:  P(x = 2,3,4,5,6 | m = Success in pop , n = Failure in pop , k = sample size) 
sum(dhyper(x = 2:6,     # success in sample / event of interest
           m = 6,       # Success in Population
           n = 19,      # Failure in Population
           k = 6        # Sample Size
           ) 
    )         
## [1] 0.4528515
# standard cdf / lower.tail = T, but Question asks for 2 or more people in the sample...thus subtract 1
1 - phyper(q = 1,
           m = 6,
           n = 19,
           k = 6, 
           lower.tail = T # P[X≤x]
           )  
## [1] 0.4528515
# complement of standard cdf / lower.tail = F
    phyper(q = 1,
           m = 6,
           n = 19,
           k = 6, 
           lower.tail = F # P[X>x]
           )  
## [1] 0.4528515

0.4529.

6. The weights of steers in a herd are distributed normally. The variance is 90,000 and the mean steer weight is 800 lbs. Find the probability that the weight of a randomly selected steer is between 1040 and 1460 lbs. Round your answer to four decimal places.

i. Identify the distribution

This is a Normal distribution problem, as mentioned in the question.

ii. Identify the parameters

Given \(\sigma^2 = 90,000\) and \(\mu=800\).

This implies \(\sigma=sqrt(90000)=300\)

iii. See what is the Probability Statement Required to be Solved so that you know what to compute

Probability that the weight of a randomly selected steer is between 1040 and 1460 lbs. \(P(1040<X<1460) | \mu=800, \sigma = 300)\)

iv. Compute/Evaluate

First, lets plot the normal distribution.

Plotting normal distribution in R

#define population mean and standard deviation
population_mean  <-   800
population_sd    <-   300


#### FOR GRAPHING, else draw a rough figure

#define upper and lower bound
lower_bound   <-  population_mean - population_sd
upper_bound   <-  population_mean + population_sd

#Create a sequence of 1000 x values based on population mean and standard deviation
?seq()
x <- seq(from   = -4, 
         to     = 4, 
         length = 1000) * population_sd + population_mean

#create a vector of values that shows the height of the probability distribution for each value in x
y <- dnorm(x = x,    # argument = vector x created by seq() function
           mean = population_mean, 
           sd = population_sd
           )

#plot normal distribution with customized x-axis labels
plot(x = x,
     y = y, 
     type = "l", 
     lwd = 2, 
     axes = FALSE, 
     xlab = "", 
     ylab = ""
     )

sd_axis_bounds <- 5
axis_bounds <- seq(from = -sd_axis_bounds * population_sd + population_mean,
                     to =  sd_axis_bounds * population_sd + population_mean,
                     by = population_sd)

axis(side = 1, at = axis_bounds, pos = 0)

# Shade the area between 1040 and 1060
?seq
x_shade <- seq(from = 1040, to = 1460, by=(length.out=10) )
  head(x_shade)
## [1] 1040 1050 1060 1070 1080 1090
y_shade <- dnorm(x = x_shade, mean = population_mean, sd = population_sd )
  head(y_shade)
## [1] 0.0009656385 0.0009397063 0.0009134549 0.0008869508 0.0008602594
## [6] 0.0008334447
polygon(x = c(1040, x_shade,1460), 
        y = c(0,y_shade,0),
        col = "gray", 
        border = NA)

CDF approaches will be better here.

sum(dnorm(x= 1040:1460,  # THIS IS AN APPROXIMATION AS WE WANT THE x values to be between 1040 and 1460
          mean = 800,
          sd = 300)
    )  # Brute Force not the best method here as it is a continuous distribution...it is an approximation, and should be avoided.
## [1] 0.198494
# MOST DIRECT AND BEST METHOD 
  pnorm(q    = 1460, 
        mean = 800, 
        sd   = 300) -  pnorm(q    = 1040, 
                             mean = 800, 
                             sd   = 300)
## [1] 0.197952
# Better than dnorm as normal is continous distribution, not discrete.
  pnorm(q    = 1040, 
        mean = 800, 
        sd   = 300, 
        lower.tail = F) -  pnorm(q    = 1460, 
                                 mean = 800, 
                                 sd   = 300, 
                                 lower.tail = F) 
## [1] 0.197952
round( x = (pnorm(q = 1460, mean = 800,sd = sqrt(90000))-pnorm(q = 1040,mean = 800,sd = sqrt(90000)) ),digits = 4)
## [1] 0.198

0.198

7. The diameters of ball bearings are distributed normally. The mean diameter is 106 millimeters and the standard deviation is 4 millimeters. Find the probability that the diameter of a selected bearing is between 103 and 111 millimeters. Round your answer to four decimal places.

i. Identify the distribution

This is a Normal distribution problem, as mentioned in the question.

ii. Identify the parameters

Given \(\sigma = 4\) and \(\mu=106\).

iii. See what is the Probability Statement Required to be Solved so that you know what to compute

Probability that the diameter of a selected bearing is between 103 and 111 millimeters. \(P(103<X<111 | \mu=106, \sigma = 4 )\)

iv. Compute/Evaluate

#define population mean and standard deviation
population_mean     <-    106
population_sd       <-    4


#### FOR GRAPHING 
#define upper and lower bound
lower_bound   <-  population_mean - population_sd
upper_bound   <-  population_mean + population_sd

#Create a sequence of 1000 x values based on population mean and standard deviation
x <- seq(from   = -4, 
         to     = 4, 
         length = 1000) * population_sd + population_mean

#create a vector of values that shows the height of the probability distribution
#for each value in x
y <- dnorm(x    = x, 
           mean = population_mean, 
           sd   = population_sd)

#plot normal distribution with customized x-axis labels
plot(x = x,
     y = y, 
     type = "l", 
     lwd = 2, 
     axes = FALSE, 
     xlab = "", 
     ylab = ""
     )
sd_axis_bounds <- 5
axis_bounds <- seq(from = -sd_axis_bounds * population_sd + population_mean,
                     to =  sd_axis_bounds * population_sd + population_mean,
                     by = population_sd)

axis(side = 1, at = axis_bounds, pos = 0)


# Shade the area between 103 and 111
?seq
x_shade <- seq(103, 111, length.out=100 )
  head(x_shade)
## [1] 103.0000 103.0808 103.1616 103.2424 103.3232 103.4040
y_shade <- dnorm(x = x_shade, mean = population_mean, sd = population_sd )
  head(y_shade)
## [1] 0.07528436 0.07641812 0.07753730 0.07864078 0.07972741 0.08079608
polygon(x = c(103, x_shade,111), 
        y = c(0,y_shade,0),
        col = "gray", 
        border = NA)

x  <-  103:111
x
## [1] 103 104 105 106 107 108 109 110 111
dnorm(x    = 103:111, 
      mean = 106, 
      sd   = 4
      )
## [1] 0.07528436 0.08801633 0.09666703 0.09973557 0.09666703 0.08801633 0.07528436
## [8] 0.06049268 0.04566227
?dnorm
sum(dnorm(x    = 103:111, 
          mean = 106, 
          sd   = 4)
    )  # Brute Force not the best method here, it is a continuous distribution...should be avoided...
## [1] 0.725826
# if you really want to use discrete method, you will have to evaluate each observation at its midpoint - 103.5 will represent 103 to 104, 104.5 will represent 104 to 105,... 
sum(dnorm(x    = 103.5:110.5, # 8 points evaluated at the midpoints 
          mean = 106, 
          sd   = 4)
    )
## [1] 0.6689098
# MOST DIRECT AND BEST METHOD 
  pnorm(q = 111, 
        mean = 106, 
        sd = 4) -  pnorm(q=103, 
                         mean = 106, 
                         sd = 4)
## [1] 0.6677229
#### shorter way to do the same thing...  
?diff  # Returns suitably lagged and iterated differences.
    diff(x = pnorm(c(103,111), mean = 106, sd = 4)  # a numeric vector or matrix containing the values to be differenced.
         )  
## [1] 0.6677229
pnorm(q = 103, 
      mean = 106, 
      sd = 4, 
      lower.tail = F) -  pnorm(q = 111, 
                               mean = 106, 
                               sd = 4, 
                               lower.tail = F) 
## [1] 0.6677229
round( x      = ( pnorm(q = 111,mean = 106,sd = 4) - pnorm(q = 103,mean = 106,sd = 4) ),
       digits = 4) 
## [1] 0.6677

0.6677

8. The lengths of nails produced in a factory are normally distributed with a mean of 3.34 centimeters and a standard deviation of 0.07 centimeters. Find the two lengths that separate the top 3% and the bottom 3%. These lengths could serve as limits used to identify which nails should be rejected. Round your answer to the nearest hundredth, if necessary.

You will have to use the quantile function, qnorm() here.

i. Identify the distribution

This is a Normal distribution problem, as mentioned in the question.

ii. Identify the parameters

\(\mu=3.34, \sigma=.07\)

iii. find the two lengths that separate the top 3% and the bottom 3%.

Use the ?qnorm function.

iv. Compute/Evaluate

Quantile functions

What is a quantile? Quantile is a statistical term used to describe the division of a set of numerical values into subgroups, called quantiles, so that each subgroup contains approximately the same number of observations. The most common quantile is the median, which splits a dataset into two equal halves. Other common quantiles include quartiles (splitting the data into 4 equal parts), deciles (10 equal parts), and percentiles (100 equal parts). Quantiles can provide valuable information about the distribution of data, such as skewness, kurtosis, and the range of values.

What is quantile function?

A quantile function, also known as a quantile map or an inverse cumulative distribution function, is a mathematical function that maps a probability value to the corresponding quantile value in a distribution. In other words, given a probability value, the quantile function returns the value at which that probability of the data is exceeded. For example, the 0.75 quantile, also known as the 75th percentile, is the value below which 75% of the data falls.

Quantile functions are commonly used in statistics and data analysis to summarize the distribution of a dataset, and to compare the distributions of different datasets. For a continuous distribution, the quantile function is unique and can be represented analytically. For discrete distributions, quantile functions may not exist, or may not be unique, and are often estimated from the data.

?qnorm 

qnorm(p = .03, mean = 3.34, sd = .07)
## [1] 3.208344
qnorm(p = .97, mean = 3.34, sd = .07)
## [1] 3.471656
round(x      = c( qnorm(p = .03, mean = 3.34, sd = .07), 
                  qnorm(p = .97, mean = 3.34, sd = .07)
                  ), 
      digits = 2)
## [1] 3.21 3.47

3.21 and 3.47 separate the bottom 3% and the top 3%.

Furthermore, We know normal distribution is symmetric. In fact, the 50th percentile / middle 50% data value in ascending order ranked right in the middle of 3 and 97 percentile values.

qnorm(p    = c (.03,.5,.97),  # Let p be the vector of probabilities. Here, the vector represents 3, 50 and 97 percentile.  
      mean = 3.34, 
      sd   = .07
      )   
## [1] 3.208344 3.340000 3.471656
#define population mean and standard deviation
population_mean <- 3.34
population_sd   <- .07

# Store quantiles for 3rd, 50th, and 97th percentiles
quantiles <- qnorm(p = c(0.03, 0.5, 0.97), mean = population_mean, sd = population_sd)


#define upper and lower bound
lower_bound <- population_mean - population_sd
upper_bound <- population_mean + population_sd

#Create a sequence of 1000 x values based on population mean and standard deviation
x <- seq(from = -4, to = 4, length = 1000) * population_sd + population_mean

#create a vector of values that shows the height of the probability distribution
#for each value in x
y <- dnorm(x, population_mean, population_sd)

#plot normal distribution with customized x-axis labels
plot(x,y, type = "l", lwd = 2, axes = FALSE, xlab = "", ylab = "")
sd_axis_bounds = 5
axis_bounds <- seq(from = -sd_axis_bounds * population_sd + population_mean,
                     to =  sd_axis_bounds * population_sd + population_mean,
                     by = population_sd)
axis(side = 1, at = axis_bounds, pos = 0)


# Shade areas corresponding to the 3rd, 50th, and 97th percentiles
polygon(x = c(x[x <= quantiles[1]], quantiles[1]), 
        y = c(y[x <= quantiles[1]], 0), 
        col = "red", 
        border = NA)


polygon(x = c(quantiles[3], x[x >= quantiles[3]]), 
        y = c(0, y[x >= quantiles[3]]), 
        col = "green", 
        border = NA)

# Find the length separating the top 3%
top_3_percent <- qnorm(0.97, mean = population_mean, sd = population_sd )

# Find the length separating the bottom 3%
bottom_3_percent <- qnorm(0.03, mean = population_mean, sd = population_sd )

# Print the results
cat("The length separating the top 3% of nails:", round(top_3_percent, 2), "cm\n")
## The length separating the top 3% of nails: 3.47 cm
cat("The length separating the bottom 3% of nails:", round(bottom_3_percent, 2), "cm\n")
## The length separating the bottom 3% of nails: 3.21 cm

9. A psychology professor assigns letter grades on a test according to the following scheme.

  • A: Top 9% of scores
  • B: Scores below the top 9% and above the bottom 63%
  • C: Scores below the top 37% and above the bottom 17%
  • D: Scores below the top 83% and above the bottom 8%
  • F: Bottom 8% of scores

Scores on the test are normally distributed with a mean of 75.8 and a standard deviation of 8.1. Find the minimum score required for an A grade. Round your answer to the nearest whole number, if necessary.

i. Identify the distribution

This is a Normal distribution problem, as mentioned in the question.

ii. Identify the parameters

\(\mu=75.8, \sigma=8.1\)

iii. Minimum score required for an A grade

\(91^{st}\) percentile.

iv. Compute/Evaluate

Quintile function in R

ANS <-    qnorm(p    = .91,      # p is vector of probabilities.
                mean = 75.8, 
                sd   = 8.1
                ) 


round(x      = ANS,
      digits = 0
      )
## [1] 87

87

Visually -

# Define population mean and standard deviation
population_mean <- 75.8
population_sd <- 8.1

# Find the minimum score required for an A grade (91st percentile)
min_A_score <- qnorm(p = 0.91, mean = population_mean, sd = population_sd)

# Create a sequence of x values
x <- seq(from = population_mean - 3 * population_sd, 
         to = population_mean + 3 * population_sd, 
         length.out = 1000
         )

# Compute the corresponding y values (probability density function)
y <- dnorm(x, mean = population_mean, sd = population_sd)

# Plot the normal distribution
plot(x, y, type = "l", lwd = 2, main = "Normal Distribution of Test Scores", xlab = "Test Scores", ylab = "Probability Density")

# Shade the area corresponding to the top 9% of scores (A grade)
polygon(x = c(min_A_score, x[x >= min_A_score]), 
        y = c(0, y[x >= min_A_score]), 
        col = "lightblue"
        )

# Add text to label the shaded area
text(x = min_A_score + 2, y = max(y) * 0.2, labels = "A Grade", col = "blue")

# Add a vertical line to mark the minimum score for an A grade
abline(v = min_A_score, col = "blue", lty = 2)

10. Consider the probability that exactly 96 out of 155 computers will not crash in a day. Assume the probability that a given computer will not crash in a day is 61%. Approximate the probability using the normal distribution. Round your answer to four decimal places.

i. Identify the distribution

  • This is a binomial distribution problem, but we want to approximate it with a Normal distribution.

ii. Identify the parameters

Let random variable \(X\) measure the count of no crashes. Given \(\pi = 61\% = .61\) is the success, as usual (here - no computer crash). Also, given \(n = 155\), the number of trails.

n    <- 155
pi   <- .61

MEAN <- n * pi                     # approximately
SD   <- sqrt(pi * (1-pi) * n )     # approximately

iii. See what is the Probability Statement Required to be Solved so that you know what to compute

\(X\) measures no crash. \(P (X = 96 | \pi = .61, n=155 )\)

iv. Compute/Evaluate

# USE DNORM IF normal 
  # if normal, mean = n * p  ; sd = sqrt (var) = sqrt (p * (1-p) * n)
dnorm(x    = 96,         # 95.9:96.1 
      mean = .61 * 155, 
      sd   = sqrt(.61 * .39 * 155 )
      ) 
## [1] 0.06385071
# Alternatively approximate by centering around X=96 (add and subtract .5 from the point estimate - not the recommended approach)
pnorm(q = 96.5, mean = .61*155, sd = sqrt(.61 * .39 * 155) ) -  
  pnorm(q = 95.5, mean = .61*155, sd = sqrt(.61 * .39 * 155) )
## [1] 0.06378274
# USE DIBINOM IF binomial
dbinom( x    = 96, 
        size = 155, 
        prob = .61
        ) 
## [1] 0.06402352

Normal ( 0.06378274 ) approximates the binomial probability well in this case ( 0.06402352 ) as n is large - both np and n(1-p) are larger than 10.

Show n is large and \(\pi\) is small -

n  <- 155
p  <- .61

n * p       > 10
## [1] TRUE
n * (1-p)   > 10
## [1] TRUE

Visually, they are the same when large sample and small proportion -

# Define parameters
n <- 155
pi <- 0.61

# Compute mean and standard deviation for the normal approximation
mean_normal <- n * pi
sd_normal <- sqrt(pi * (1 - pi) * n)

# Create a sequence of x values
x <- seq(from = 0, to = n, by = 1)

# Compute the corresponding probabilities for the binomial distribution
prob_binomial <- dbinom(x, size = n, prob = pi)

# Compute the corresponding probabilities for the normal distribution approximation
prob_normal <- dnorm(x, mean = mean_normal, sd = sd_normal)

# Set up the grid layout
par(mfrow = c(2, 1))

# Plot binomial distribution
plot(x, prob_binomial, type = "h", lwd = 2, col = "blue",
     main = "Binomial Distribution",
     xlab = "Number of Computers Not Crashing",
     ylab = "Probability")

# Plot normal approximation
plot(x, prob_normal, type = "h", lwd = 2, col = "red",
     main = "Normal Approximation",
     xlab = "Number of Computers Not Crashing",
     ylab = "Probability")