library(IS606)
##
## Welcome to CUNY IS606 Statistics and Probability for Data Analytics
## This package is designed to support this course. The text book used
## is OpenIntro Statistics, 3rd Edition. You can read this by typing
## vignette('os3') or visit www.OpenIntro.org.
##
## The getLabs() function will return a list of the labs available.
##
## The demo(package='IS606') will list the demos that are available.
library(dplyr)
# Standard-normal areas for four Z-score regions. Each area is printed and then
# drawn with normalPlot() (provided by the IS606 package loaded above).

# Area to the right of z1. pnorm(..., lower.tail = FALSE) returns the upper
# tail directly instead of subtracting the lower tail from 1.
z1 <- -1.13
blueA <- pnorm(z1, lower.tail = FALSE)
blueA
## [1] 0.8707619
normalPlot(mean = 0, sd = 1, bounds = c(-1.13, 4))

# Area to the left of z2.
z2 <- 0.18
blueB <- pnorm(z2)
blueB
## [1] 0.5714237
normalPlot(mean = 0, sd = 1, bounds = c(-4, 0.18))

# Area to the right of z3. This far into the tail, 1 - pnorm(z3) cancels almost
# all significant digits (it evaluates to 6.661338e-16, a multiple of machine
# epsilon); asking for the upper tail keeps full precision.
z3 <- 8
blueC <- pnorm(z3, lower.tail = FALSE)
blueC
## [1] 6.220961e-16
# NOTE(review): the bounds below are given as c(8, 4), i.e. lower > upper, and
# the other plots in this section only span -4 to 4 - confirm what normalPlot()
# draws for this input.
normalPlot(mean = 0, sd = 1, bounds = c(8, 4))

# Central area between -z4 and z4, computed as twice the area between -z4 and 0.
z4 <- 0.5
blueD <- 2 * (0.5 - pnorm(-1 * z4))
blueD
## [1] 0.3829249
# Convert the central area back into its two Z-score limits (recovers -z4, z4).
lowerBound <- qnorm(.5 - (blueD / 2))
upperBound <- qnorm(.5 + (blueD / 2))
normalPlot(mean = 0, sd = 1, bounds = c(lowerBound, upperBound))
Men (30-34): N(μ = 4313, σ = 583) Women (25-29): N(μ = 5261, σ = 807)
# Standardise an observed value against its group's mean and standard deviation.
zScore <- function(obs, groupMean, groupSD) {
  (obs - groupMean) / groupSD
}

# Men's group parameters and Leo's observed time.
meanM <- 4313
SDM <- 583
leo <- 4948

# Women's group parameters and Mary's observed time.
meanF <- 5261
SDF <- 807
mary <- 5513

# Z score of each racer within their own group.
leoZ <- zScore(leo, meanM, SDM)
print(leoZ)
## [1] 1.089194
maryZ <- zScore(mary, meanF, SDF)
print(maryZ)
## [1] 0.3122677

# Shade the standard-normal region between the two Z scores.
normalPlot(mean = 0, sd = 1, bounds = c(maryZ, leoZ))
A Z score of 1.09 tells us that Leo finished 1.09 standard deviations above the mean for his group while Mary’s time was 0.31 standard deviations above the mean.
Because a lower time is better, Mary had a better race than Leo when each is compared to their own peer group. Both distributions are normal and both racers finished with times above their group means, so the racer whose Z score is closer to the mean (0) performed better within their peer group.
# Share of each peer group with a larger Z score (a slower time) than the racer.
# The upper tail is requested directly with lower.tail = FALSE rather than
# computed as 1 - pnorm(z).

# Region at or below Leo's Z score.
normalPlot(mean = 0, sd = 1, bounds = c(-4, leoZ))
leoP <- pnorm(leoZ, lower.tail = FALSE)
leoP
## [1] 0.1380342
# Leo finished faster than 13.8% of his peer group
normalPlot(mean = 0, sd = 1, bounds = c(leoZ, 4))

maryP <- pnorm(maryZ, lower.tail = FALSE)
maryP
## [1] 0.3774186
# Mary finished faster than 37.7% of her peer group
normalPlot(mean = 0, sd = 1, bounds = c(maryZ, 4))
Our answers to parts b-e would change, as the normal probability calculations would no longer apply. Using Z scores to determine probabilities requires a normal distribution in order to be accurate.
# Observed heights, stored in a one-column data frame named `hgt`.
heights <- data.frame(hgt = c(54, 55, 56, 56, 57, 58, 58, 59, 60, 60, 60, 61,
                              61, 62, 62, 63, 63, 63, 64, 65, 65, 67, 67, 69,
                              73))
meanH <- mean(heights$hgt)
SDH <- sd(heights$hgt)

# SDtest: share of the observed heights lying inside / outside the interval
# meanH +/- SDnum * SDH.
#
# SDnum   number of standard deviations on each side of the mean.
# Returns a data frame with one row per outcome that occurs:
#   within  "TRUE" for heights inside the interval, "FALSE" for those outside
#   n       proportion of all heights with that outcome
# Only the counts are divided by the sample size (taken from the data rather
# than hard-coded), so the `within` labels stay readable.
SDtest <- function(SDnum) {
  lower <- meanH - SDH * SDnum
  upper <- meanH + SDH * SDnum
  within <- heights$hgt >= lower & heights$hgt <= upper
  tally <- as.data.frame(table(within = within), responseName = "n")
  tally$n <- tally$n / nrow(heights)
  tally
}

# 1 SD
SDtest(1)
## within n
## 1 FALSE 0.32
## 2 TRUE 0.68
# 2 SD
SDtest(2)
## within n
## 1 FALSE 0.04
## 2 TRUE 0.96
# 3 SD
SDtest(3)
## within n
## 1 TRUE 1
Yes, the heights approximately follow the 68-95-99.7% rule: on this relatively small data set, 68%, 96% and 100% of the observations fall within one, two and three standard deviations of the mean.
# Normal probability plot of the OBSERVED heights. Plotting simulated normal
# draws here would always look normal and say nothing about the data.
qqnorm(heights$hgt)
qqline(heights$hgt)

# A simulated normal sample with the same size, mean and SD as the data;
# retained so any code that refers to sim_hgt keeps working.
sim_hgt <- rnorm(length(heights$hgt), meanH, SDH)

# Compare the observed heights with Q-Q plots of simulated normal data
# (qqnormsim() comes from the IS606 package).
qqnormsim(heights$hgt)
Yes, this data appears to come from a normal distribution. The points in the normal probability plot stay close to a straight line and look similar to the 9 simulated probability plots, which use the same number of observations as our data set but are guaranteed to come from a normal distribution.
# Mean and standard deviation of the number of units produced up to and
# including the first defect, for a per-unit defect probability `prob`.
waitMean <- function(prob) {
  1 / prob
}
waitSD <- function(prob) {
  sqrt((1 - prob) / (prob^2))
}

# Per-transistor defect probability.
p <- 0.02

# Probability that the first defect is the n-th unit:
# (n - 1) units without a defect followed by one with a defect.
n <- 10
p * (1 - p)^(n - 1)
## [1] 0.01667496

# Probability of no defect at all among n units.
n <- 100
b <- (1 - p)^n
round(b, 4)
## [1] 0.1326

u <- waitMean(p)
u
## [1] 50
# On average we expect 50 transistors to be produced per defect.
sd <- waitSD(p)
sd
## [1] 49.49747
# SD is 49.5

# Same two quantities with a higher defect probability.
p <- 0.05
u <- waitMean(p)
u
## [1] 20
# On average we expect 20 transistors to be produced per defect.
sd <- waitSD(p)
sd
## [1] 19.49359
# SD is 19.5
The mean and the standard deviation of the wait time both decrease when the probability of a defect increases.
# Probability of a boy, and the target outcome: k boys among n children.
boyP <- 0.51
n <- 3
k <- 2

# Number of orderings with k boys among n children, built from factorials.
facN <- factorial(n)
facK <- factorial(k)
facN_K <- factorial(n - k)
orderings <- facN / (facK * facN_K)

# Binomial probability of exactly k boys.
boys2 <- orderings * boyP^k * (1 - boyP)^(n - k)
print(boys2)
## [1] 0.382347

# The three possible birth orders with two boys, one per column.
ex1 <- c("boy", "boy", "girl")
ex2 <- c("boy", "girl", "boy")
ex3 <- c("girl", "boy", "boy")
df <- data.frame(ex1, ex2, ex3)
print(df)
## ex1 ex2 ex3
## 1 boy boy girl
## 2 boy girl boy
## 3 girl boy boy

# Probability of each ordering, added together.
girlP <- 1 - boyP
boyBoyGirl <- boyP * boyP * girlP
boyGirlBoy <- boyP * girlP * boyP
girlBoyBoy <- girlP * boyP * boyP
add <- boyBoyGirl + boyGirlBoy + girlBoyBoy
print(add)
## [1] 0.382347
# This matches the previous calculation. More simply, since every ordering has
# the same probability, one ordering can be multiplied by three:
3 * boyBoyGirl
## [1] 0.382347
This would greatly add to the number of possible sequences that would match the given parameters, making it much more tedious to use the method in part b than the more efficient formula used in part a.
n <- 10   # number of attempts
k <- 3    # successful serves
p <- 0.15 # probability that a single serve is successful

# Probability that the k-th success happens exactly on the n-th attempt:
# (k - 1) successes among the first (n - 1) attempts, then a success on
# attempt n. choose() replaces the hand-written factorial ratio, and the
# previously unused nFac / kFac intermediates are dropped.
tenth <- choose(n - 1, k - 1) * p^k * (1 - p)^(n - k)
tenth
## [1] 0.03895012
Because the serves are independent of each other, the probability that she successfully makes any one individual serve is 15%.