library("psych")
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Question 1

df <- read.csv("/Users/pin.lyu/Desktop/challenger.csv")
    describe(df)
    ##              vars  n  mean   sd median trimmed  mad  min  max range  skew
    ## launch          1 23 12.00 6.78   12.0   12.00 8.90  1.0 23.0    22  0.00
    ## temp            2 23 69.02 6.97   69.8   69.33 5.34 53.6 80.6    27 -0.40
    ## incident*       3 23  1.30 0.47    1.0    1.26 0.00  1.0  2.0     1  0.80
    ## o_ring_probs    4 23  0.43 0.79    0.0    0.26 0.00  0.0  3.0     3  1.81
    ##              kurtosis   se
    ## launch          -1.36 1.41
    ## temp            -0.44 1.45
    ## incident*       -1.42 0.10
    ## o_ring_probs     2.69 0.16

    Launch: Ordinal

    Temp: Interval

    Incident: Nominal

    o_ring_probs: Ratio

    hist(df$o_ring_probs, xlab = "O-ring partial failures per flight", ylab = "Number of flights", main = "Total count of O-ring partial failures experienced per flight")

    boxplot(df$temp ~ df$incident, xlab = "Incident (Yes/No)", ylab = "Temperature (F)")

    As we can see from this graph, there are more o-ring failures at lower temperatures, so colder temperatures may contribute to the likelihood of o-ring partial failures in flight. Given that the temperature on the day of the Challenger launch was 36 degrees Fahrenheit, far below the roughly 70 F range where most no-incident launches occurred, launching the shuttle at that temperature would increase the probability of an o-ring failure.
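
    To put a number on this, we can compare the average launch temperature by incident group. This is a quick sketch using the same df and the dplyr functions loaded above; per the sorted table further below, the incident flights were launched at noticeably colder temperatures on average than the no-incident flights.

    df |>
      group_by(incident) |>
      summarise(mean_temp = mean(temp),  # average launch temperature per group
                launches  = n())         # number of flights per group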

    df |>
      arrange(o_ring_probs)
    ##    launch temp incident o_ring_probs
    ## 1       5 66.2       No            0
    ## 2       6 66.2       No            0
    ## 3       7 66.2       No            0
    ## 4       8 66.2       No            0
    ## 5       9 66.2       No            0
    ## 6      10 68.0       No            0
    ## 7      12 69.8       No            0
    ## 8      14 69.8       No            0
    ## 9      15 69.8       No            0
    ## 10     16 71.6       No            0
    ## 11     17 73.4       No            0
    ## 12     19 75.2       No            0
    ## 13     20 75.2       No            0
    ## 14     21 78.8       No            0
    ## 15     22 78.8       No            0
    ## 16     23 80.6       No            0
    ## 17      2 57.2      Yes            1
    ## 18      3 57.2      Yes            1
    ## 19      4 62.6      Yes            1
    ## 20     11 69.8      Yes            1
    ## 21     13 69.8      Yes            1
    ## 22     18 75.2      Yes            2
    ## 23      1 53.6      Yes            3

    As we can see from the sorted table above, launch number 5 was the first launch to occur without an o-ring incident.
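
    This can be verified directly with a one-line check on the same df; per the table above, it returns launch 5.

    # smallest launch number among flights with no incident
    min(df$launch[df$incident == "No"])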

    df_new <- df[df$temp > 65, ]
    output <- nrow(df_new[df_new$incident == "Yes", ])
    output
    ## [1] 3

Question 2

P(+|L) = 0.59, probability that a liar tests positive

P(-|NL) = 0.90, probability that a non-liar tests negative

P(L) = 0.2, prior probability that a person is a liar

\[ P(L|+)= \frac{P(+|L)P(L)}{P(+)} \]
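
The denominator expands by the law of total probability, which is exactly what the denominator in the code below computes:

\[ P(+) = P(+|L)P(L) + P(+|NL)P(NL) = (0.59)(0.2) + (0.10)(0.8) = 0.198 \]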

pos <- 0.59
negative <- 0.90
p <- 0.2 # expected population of liars
p_prime <- 1 - p # expected population of non-liars


result <- (pos*p)/((pos*p)+(p_prime*(1-negative)))
round(result,4) 
## [1] 0.596

P(L) = 0.2

P(+) = 0.198

\[ P(L \cup +) = P(L) + P(+) - P(L \cap +) \]

pL <- 0.2
p_plus <- 0.198

result <- pL+p_plus-(pL*p_plus)
result
## [1] 0.3584
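
Written out, the calculation above uses the product P(L)P(+) for the intersection term, i.e., it treats the two events as independent:

\[ P(L \cup +) = 0.2 + 0.198 - (0.2)(0.198) = 0.3584 \]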

Question 3

  • Poisson
lambda <- 1 * 8/10
x <- 1

ppois( q = x,
       lambda = lambda,
       lower.tail = F
  
)
## [1] 0.1912079
# Expected value
lambda
## [1] 0.8
# Standard deviation

sd <- sqrt(lambda)
sd
## [1] 0.8944272
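
For reference, the same tail probability written out by hand with lambda = 0.8:

\[ P(X > 1) = 1 - P(X=0) - P(X=1) = 1 - e^{-0.8} - 0.8e^{-0.8} \approx 0.1912 \]
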
  • Binomial
# Parameters
n <- 10
x <- 8
p <- 1/10   

pbinom( q = x,
        size = n,
        prob = p,
        lower.tail = FALSE
)
## [1] 9.1e-09
# Expected value
ex <- n*p
ex
## [1] 1
# Standard deviation
sd <- sqrt(n*p*(1-p))
sd
## [1] 0.9486833
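
As a sanity check, the tail probability above written out by hand:

\[ P(X > 8) = \binom{10}{9}(0.1)^{9}(0.9) + (0.1)^{10} = 9 \times 10^{-9} + 1 \times 10^{-10} = 9.1 \times 10^{-9} \]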

Question 4

# parameters
wrong_prob <- 3/4  # 3 of the 4 answer choices are wrong
right_prob <- 1/4  # 1 way to get the right answer

result <- (wrong_prob^2)*(right_prob)
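
For reference, the product above evaluates to:

\[ \left(\tfrac{3}{4}\right)^{2}\left(\tfrac{1}{4}\right) = \tfrac{9}{64} = 0.140625 \]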

Model chosen: Binomial

  1. Only two possible results (Right/Wrong)
  2. Results are mutually exclusive
  3. We count the number of questions answered correctly out of a fixed number of trials (5 questions)
  4. Each question’s probability of being right or wrong is independent of the other questions
#probability statements
n <- 5
x1 <- 3
x2 <- 4
p <- 1/4

# Getting exactly 3 right 
lower <- dbinom( x = x1,
        size = n,
        prob = p
)
# Getting exactly 4 right 
higher <- dbinom( x = x2,
        size = n,
        prob = p
)

# Probability of getting, in total, 3 or 4 questions correct
result <- lower + higher  #(A or B) = P(A)+P(B)
result
## [1] 0.1025391
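
The same total written out by hand:

\[ P(X=3) + P(X=4) = \binom{5}{3}(0.25)^{3}(0.75)^{2} + \binom{5}{4}(0.25)^{4}(0.75) \approx 0.0879 + 0.0146 = 0.1025 \]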

The majority threshold is 3 correct answers, so we want the probability of getting 3 or more questions correct, P(X >= 3). Because pbinom() with lower.tail = FALSE returns P(X > q), we set q = 2.

# Probability statement 
n <- 5
x <- 2
p <- 1/4

# method 1 "cdf with "lower.tail = FALSE"
pbinom( q = x,
        size = n,
        prob = p,
       lower.tail = F
)
## [1] 0.1035156
# method 2 "pdf approach"

a <- dbinom(x = 3,
        size = n,
        prob = p
)

b <- dbinom(x = 4,
        size = n,
        prob = p
)

c <- dbinom(x = 5,
        size = n,
        prob = p
)

a+b+c
## [1] 0.1035156

Question 5

    • a1)
    # Probability statement 
    std <- 4.78
    mu <- 72.6
    
    # (x < 80), therefore, lower.tail = TRUE
    pnorm(q = 80,
          mean = mu,
          sd = std,
          lower.tail = T
    )
    ## [1] 0.939203
    • a2)
    # (68 < x < 78)
    lower <- pnorm(q = 68,
          mean = mu,
          sd = std,
          lower.tail = T
    )
    
    upper <- pnorm(q = 78,
          mean = mu,
          sd = std,
          lower.tail = T
    )
    
    result <- upper - lower
    result
    ## [1] 0.7027615
    # This answer makes sense: the mean is 72.6 miles/hr, and the interval from 68 to 78 miles/hr spans roughly one standard deviation on either side of the mean. Within one standard deviation of the mean we expect about 68% of the data, and this slightly wider interval captures about 70%, so the result is reasonable.
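
    A quick z-score check of this reasoning, reusing the mu and std defined above, gives roughly -0.96 and +1.13:

    # standardized distances of 68 and 78 from the mean
    (c(68, 78) - mu) / std
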
    • a3)

      # (x > 70) -> lower.tail = FALSE
      pnorm(q = 70,
            mean = mu,
            sd = std,
            lower.tail = F
      )
      ## [1] 0.7067562
    • b1)

      # Parameter setup (men)
      mu_m <- 4313
      std_m <- 583
      
      # top 5% here means beating 95% of the group (lower values are better), so we want the cutoff that 95% of the distribution exceeds: p = 0.95 with lower.tail = FALSE
      qnorm(p = 0.95,
            mean = mu_m,
            sd = std_m,
            lower.tail = F
      )
      ## [1] 3354.05
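
      The same cutoff can be obtained with the complementary call, which asks directly for the 5th percentile and returns the same 3354.05:

      # equivalent form: 5% of the distribution lies below this value
      qnorm(p = 0.05,
            mean = mu_m,
            sd = std_m,
            lower.tail = T
      )
      ## [1] 3354.05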
    • b2)

      # Parameter setup (women)
      mu_w <- 5261
      std_w <- 807
      
      # bottom 10%
      qnorm(p = 0.10,
            mean = mu_w,
            sd = std_w,
            lower.tail = T
      )
      ## [1] 4226.788

Bonus Question 6

mymatrix = matrix(data = c(7,5,12,0,2,2,7,7,14), nrow=3,byrow=F)
colnames(mymatrix) = c("Survived","Died","Total")
rownames(mymatrix) = c("Drug1","Drug2","Total")
mymatrix
##       Survived Died Total
## Drug1        7    0     7
## Drug2        5    2     7
## Total       12    2    14
    • a) & b)
prop.table(x = mymatrix, margin = 2) 
##        Survived Died Total
## Drug1 0.2916667  0.0  0.25
## Drug2 0.2083333  0.5  0.25
## Total 0.5000000  0.5  0.50

    • c)

    Mutually exclusive events cannot be independent unless the probability of one of the events is zero. Independent events can still both occur together; mutually exclusive means that when one outcome happens, the other cannot happen at all.

    • d)

# Set the mean and standard deviation
mu    <- 75
sigma <- 3

# Generate a range of values around the mean
x <- seq(from       =  mu - 3*sigma,
         to         =  mu + 3*sigma, 
         length.out =  1000
         )

# Calculate the probability density function
pdf <- dnorm(x    = x, 
             mean = mu, 
             sd   = sigma
             )

# Plot the normal distribution

plot(x    = x, 
     y    = pdf,
     type = 'l', 
     col  = 'red', 
     lwd  = 2, 
     xlab = 'Grades', 
     ylab = 'Density',
     main = 'Normal Distribution of Test score with Mean 75 and SD 3'
     )