rm(list = ls()) #remove all objects from the environment
cat("\f") # clear the console
graphics.off() # clear all graphs
Import the Packages Needed
library(psych)
library(moments)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
Import the Data
library(readr)
challenger_1 <- read_csv("challenger-1.csv")
## Rows: 23 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): incident
## dbl (3): launch, temp, o_ring_probs
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
a. Get summary stats using psych
describe(challenger_1) #provides summary stats for the dataframe
## vars n mean sd median trimmed mad min max range skew
## launch 1 23 12.00 6.78 12.0 12.00 8.90 1.0 23.0 22 0.00
## temp 2 23 69.02 6.97 69.8 69.33 5.34 53.6 80.6 27 -0.40
## incident* 3 23 1.30 0.47 1.0 1.26 0.00 1.0 2.0 1 0.80
## o_ring_probs 4 23 0.43 0.79 0.0 0.26 0.00 0.0 3.0 3 1.81
## kurtosis se
## launch -1.36 1.41
## temp -0.44 1.45
## incident* -1.42 0.10
## o_ring_probs 2.69 0.16
b. Levels of Measurement:
str(challenger_1)
## spc_tbl_ [23 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ launch : num [1:23] 1 2 3 4 5 6 7 8 9 10 ...
## $ temp : num [1:23] 53.6 57.2 57.2 62.6 66.2 66.2 66.2 66.2 66.2 68 ...
## $ incident : chr [1:23] "Yes" "Yes" "Yes" "Yes" ...
## $ o_ring_probs: num [1:23] 3 1 1 1 0 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. launch = col_double(),
## .. temp = col_double(),
## .. incident = col_character(),
## .. o_ring_probs = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
Launch: Launch is qualitative and nominal. Each value is simply an identifier for a launch, so there is no value in performing mathematical operations on these numbers.
Temp: Temperature is a quantitative variable at the interval level of measurement. Temperature is measured on a scale of equal-sized units and the values have order. It is also considered a continuous variable because temperature can take decimal values (not just whole numbers). It is NOT ratio level because there is no meaningful zero point on the Fahrenheit scale.
Incident: Incident is qualitative and at the nominal (categorical) level of measurement. More specifically, it is a binary nominal variable (only two states, yes and no). Yes and no are categories that classify each observation into an incident or no-incident bin; a minimal sketch of encoding this in R follows this list.
O_ring_probs: O-ring problems is quantitative and at the ratio level of measurement. It is quantitative because it counts the number of times something occurred and has real numerical value. It is ratio data because there is a meaningful zero point (you cannot have fewer than 0 problems) and the values are equally spaced, so it is meaningful to describe one value as a multiple of another.
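Since incident is nominal, one way to make that explicit in R is to store it as a factor; a minimal sketch (the object name incident_factor and the level order are illustrative assumptions, not part of the original data):
incident_factor <- factor(challenger_1$incident, levels = c("No", "Yes")) #categorical encoding of the nominal variable
table(incident_factor) #counts per category are the natural summary for nominal data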
c. Provide a graph of the O Ring Probs
par(mfrow = c(1,2))
boxplot(challenger_1$o_ring_probs, horizontal = F)
hist(challenger_1$o_ring_probs,
main = "O Ring Problems Histogram",
xlab = "Number of Problems",
col = 'red')
d. Create side-by-side boxplots
boxplot(challenger_1$temp ~ challenger_1$incident,
ylab = "Temperature")
challenger_incident <- challenger_1 |>
  dplyr::filter(incident == "Yes") #launches with an incident
challenger_no_incident <- challenger_1 |>
  dplyr::filter(incident == "No") #launches with no incident
describe(challenger_incident)
## vars n mean sd median trimmed mad min max range skew
## launch 1 7 7.43 6.55 4.0 7.43 4.45 1.0 18.0 17.0 0.43
## temp 2 7 63.63 8.11 62.6 63.63 10.67 53.6 75.2 21.6 0.13
## incident* 3 7 1.00 0.00 1.0 1.00 0.00 1.0 1.0 0.0 NaN
## o_ring_probs 4 7 1.43 0.79 1.0 1.43 0.00 1.0 3.0 2.0 1.08
## kurtosis se
## launch -1.71 2.48
## temp -1.86 3.06
## incident* NaN 0.00
## o_ring_probs -0.62 0.30
describe(challenger_no_incident)
## vars n mean sd median trimmed mad min max range skew
## launch 1 16 14.00 6.02 14.5 14.00 8.15 5.0 23.0 18.0 -0.01
## temp 2 16 71.38 5.04 69.8 71.09 5.34 66.2 80.6 14.4 0.48
## incident* 3 16 1.00 0.00 1.0 1.00 0.00 1.0 1.0 0.0 NaN
## o_ring_probs 4 16 0.00 0.00 0.0 0.00 0.00 0.0 0.0 0.0 NaN
## kurtosis se
## launch -1.56 1.51
## temp -1.31 1.26
## incident* NaN 0.00
## o_ring_probs NaN 0.00
The temperature on the day of the Challenger accident was 36 degrees F. This is concerning. As the boxplots show, launches that experienced an incident had a lower mean temperature. The mean temperature for incident launches was 63.63 F, while the mean temperature for no-incident launches was 71.38 F. The coldest launch in the data, at 53.6 F, had an incident. The no-incident group also contains no low-temperature outliers, whereas the incident group does. Launching at 36 degrees, considerably colder than the coldest previous launch, when the data already suggest an increased risk of an incident in colder weather, gives cause for concern about launching that day.
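A quick way to quantify how far below the prior launches 36 F sits (a small sketch using the existing data frame):
min(challenger_1$temp) - 36 #the forecast 36 F is 17.6 degrees below the coldest launch in the data (53.6 F)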
e. First observation of no incident
which(challenger_1$incident == "No") #these are the indexes of rows where no incident occurred
## [1] 5 6 7 8 9 10 12 14 15 16 17 19 20 21 22 23
which(challenger_1$incident == "No")[1] #give the index of only the first occurring "No"
## [1] 5
The first launch with no incident was observation (row) 5.
f. How many incidents occurred above 65 degrees F?
challenger_f <- challenger_incident |>
  dplyr::filter(temp > 65) #incident launches above 65 degrees F
nrow(challenger_f)
## [1] 3
There are 3 launches above 65 degrees F where an incident occurred.
sum(challenger_f$o_ring_probs)
## [1] 4
A total of 4 O-ring problems occurred on launches above 65 degrees F.
Define Some Variables
liar <- .2 #P of being a liar
N_liar <- .8 #P of not being a liar
Positive_G_Liar <- .59 #P of testing positive given a liar
Negative_G_Liar <- .41 #P of testing negative given a liar
Positive_and_Liar <- Positive_G_Liar * liar #joint P of testing positive and being a liar
Negative_and_Liar <- Negative_G_Liar * liar #joint P of testing negative and being a liar
Positive_G_NotLiar <- .1 #P of testing positive given not a liar
Negative_G_NotLiar <- .9 #P of testing negative given not a liar
Positive_and_NotLiar <- Positive_G_NotLiar * N_liar #joint P of testing positive and not being a liar
Negative_and_NotLiar <- Negative_G_NotLiar * N_liar #joint P of testing negative and not being a liar
\(P(A|B) = \frac{P(B|A) * P(A)}{P(B)}\)
A = Liar
B = Positive on the Polygraph.
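Plugging in the values defined above:
\(P(Liar | +) = \frac{.59 * .2}{.59 * .2 + .1 * .8} = \frac{.118}{.198} \approx .596\)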
#compute
(Positive_and_Liar) / (Positive_and_Liar + Positive_and_NotLiar)
## [1] 0.5959596
The probability that an individual is actually a liar given a positive polygraph is .596 or 59.6%.
b.
Probability that a randomly selected individual is either a liar or was identified as a liar by the polygraph.
\(P(Liar \cup +)\)
This is a union probability: it asks for the probability that the person is either a liar OR is identified as one by the polygraph. Since these events are not mutually exclusive, we must use the formula:
\(P(Liar) + P(+) - P(Liar \cap +)\)
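With the values defined above, this works out to:
\(P(Liar \cup +) = .2 + (.118 + .08) - .118 = .28\)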
#compute P(Liar) + P(+) - P(Liar and +), where P(+) = P(+ and Liar) + P(+ and Not Liar)
liar + Positive_and_Liar + Positive_and_NotLiar - Positive_and_Liar
## [1] 0.28
28% chance that a randomly selected person is either a liar or was identified as a liar by the polygraph.
a. Machine fails once every 10 years
Identify the Parameters:
\(t = 8\)
\(\lambda = 1/10\)
\(\lambda t = 8/10\)
Probability Statement:
Let X be the number of times the MRI machine fails in the 8-year interval.
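In Words: Probability that the machine has zero failures in the 8-year interval.
In Math (written out with the Poisson PMF, which is what dpois() computes below): \(P(X = 0 | \lambda t = .8) = \frac{e^{-.8}(.8)^0}{0!} = e^{-.8} \approx .4493\)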
plot(x = 0:15,
y = dpois(x = 0:15,
lambda = .8),
type = 'h',
ylab = "Probability",
xlab = "Number of failures in 8 year interval")
dpois(x = 0, lambda = .8)
## [1] 0.449329
#Standard deviation
print(sqrt(.8))
## [1] 0.8944272
# is .8944
#Expected value is lambda
#here it is .8 failures expected in 8 years.
There is a .4493 or a 44.93% chance that the machine has zero failures in its first 8 years, i.e., that its first failure occurs after year 8.
B. Binomial
Parameters:
\(n = 8\)
\(\pi = 1/10\)
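In Math (written out with the binomial PMF, which is what dbinom() computes below): \(P(X = 0 | n = 8, \pi = .1) = \binom{8}{0}(.1)^0(.9)^8 = (.9)^8 \approx .4305\)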
plot(x = 0:8,
y = dbinom(x = 0:8,
size = 8,
prob = .1),
type = 'h')
dbinom(x = 0, size = 8, prob = .1)
## [1] 0.4304672
#standard deviation
sd_threeB <- sqrt(8*.1*.9)
print(sd_threeB)
## [1] 0.8485281
#Expected Value
print(8 * .1)
## [1] 0.8
According to the binomial distribution, there is a .4305 or a 43.05% chance that the machine has zero failures in its first 8 years.
a. What is the P that Robin’s first correct question is #3.
We can use the multiplication rule for independent events. If she is just randomly guessing, we can assume the outcome of each question is independent of the others. Assuming she is equally likely to choose a, b, c, or d, and that the correct answers are equally likely to be a, b, c, or d, the probability of getting a question right is .25 and the probability of getting a question wrong is .75.
#in order for Robin's first correct question to be #3, she needs to get #1 and #2 wrong and then #3 correct.
Four_a <- (.75) * (.75) * (.25)
print(Four_a)
## [1] 0.140625
There is a 14.06% chance that the first question Robin gets right is the 3rd question.
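The same answer can be cross-checked with the geometric distribution, which models the number of failures before the first success; a short sketch (not part of the original solution):
dgeom(x = 2, prob = .25) #two wrong answers before the first correct one; returns 0.140625, matching the result above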
b.
Identify the Distribution. This is a binomial distribution. This distribution describes the number of successes in repeated trials of an experiment where each trial is independent of the others. Each trial has two possible outcomes: success and failure. In this problem a correct answer is a success and an incorrect answer is a failure.
Check the assumptions:
There are only two possible outcomes on one trial of the experiment (correct or incorrect answer)
The outcomes are mutually exclusive (cannot be right and wrong at the same time)
Random variable is the result of counts (X = how many correct answers)
Each trial is independent of each other
II. Identify Parameters:
\(n = 5\) : number of questions on the quiz
\(\pi = .25\) : 4 choices per question gives a .25 chance of getting a correct answer with guessing.
III. Probability Statement
Let X be the count of correct answers on the quiz.
In Words: Probability that Robin gets exactly 3 or 4 questions right.
In Math: \(P(3 \le X \le 4 | n = 5, \pi = .25)\)
IV. Compute
plot(x = 0:5,
y = dbinom(x = 0:5,
size = 5,
prob = .25),
type = 'h',
main = "Binomial Dist. (n = 5, pi = .25)",
ylab = "Probability",
xlab = "Number of Correct Questions")
dbinom(x = 3:4, size = 5, prob = .25)
## [1] 0.08789063 0.01464844
sum(round( x = dbinom(x = 3:4, size = 5, prob = .25), digits = 4))
## [1] 0.1025
There is a 10.25% chance that Robin gets exactly 3 or 4 questions correct on the quiz.
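As a cross-check, the same probability can be computed from the cumulative distribution with pbinom(), the approach also used in part c below:
#P(3 <= X <= 4) = P(X <= 4) - P(X <= 2)
pbinom(q = 4, size = 5, prob = .25) - pbinom(q = 2, size = 5, prob = .25) #approximately 0.1025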
c. Probability that Robin gets the majority of the questions right?
Identify the Distribution. This is a binomial distribution. This distribution describes the number of successes in repeated trials of an experiment where each trial is independent of the others. Each trial has two possible outcomes: success and failure. In this problem a correct answer is a success and an incorrect answer is a failure.
Check the assumptions:
There are only two possible outcomes on one trial of the experiment (correct or incorrect answer)
The outcomes are mutually exclusive (cannot be right and wrong at the same time)
Random variable is the result of counts (X = how many correct answers)
Each trial is independent of each other
II. Identify Parameters:
\(n = 5\) : number of questions on the quiz
\(\pi = .25\) : 4 choices per question gives a .25 chance of getting a correct answer with guessing.
III. Probability Statement
Let X be the count of correct questions on the quiz.
In Words: Probability that Robin gets majority of the questions correct.
In Math: \(P(X \ge 3 | n = 5, \pi = .25)\)
IV. Compute
plot(x = 0:5,
y = dbinom(x = 0:5,
size = 5,
prob = .25),
type = 'h',
main = "Binomial Dist. (n = 5, pi = .25)",
ylab = "Probability",
xlab = "Number of Correct Questions")
dbinom(x = 3:5, size = 5, prob = .25)
## [1] 0.0878906250 0.0146484375 0.0009765625
#Compute using the PMF
sum(round(x = dbinom(x = 3:5, size = 5, prob = .25), digits = 4))
## [1] 0.1035
#Compute using the CMF
1 - pbinom(q = 2, size = 5, prob = .25)
## [1] 0.1035156
The probability that Robin gets the majority of the questions right is .1035 or 10.35%.
A. Normal Distribution with a \(\mu = 72.6\) and \(\sigma = 4.78\)
#graph
plot(x = 0:100,
y = dnorm(x = 0:100,
mean = 72.6,
sd = 4.78),
type = 'l',
main = "Normal Dist.",
ylab = "probability",
xlab = "speed")
5A1.
#Percent of vehicles that travel slower than 80mph
pnorm(q = 80, mean = 72.6, sd = 4.78)
## [1] 0.939203
93.92% of vehicles travel slower than 80 mph.
5A2.
#Percent of vehicles that travel between 68 and 78 mph
(pnorm(q = 78, mean = 72.6, sd = 4.78)) - (pnorm(q = 68, mean = 72.6, sd = 4.78))
## [1] 0.7027615
70.28% of vehicles travel between 68 and 78 mph. This makes sense, as the interval is close to +- 1 SD of the mean, which the empirical rule says contains roughly 68% of observations.
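A quick check of the rough 68% figure from the empirical rule (a sketch, not part of the original solution):
pnorm(q = 1) - pnorm(q = -1) #proportion of a standard normal within one SD of the mean, about 0.6827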
5A3.
#Percent of vehicles that travel above the speed limit (70mph)
1 - pnorm(q = 70, mean = 72.6, sd = 4.78)
## [1] 0.7067562
70.68% of vehicles travel above the speed limit of 70 mph.
5B. Triathlon Times:
Men: Mean = 4313, SD = 583
Women: Mean = 5261, SD = 807
plot(x = 2500:6000,
y = dnorm(x = 2500:6000,
mean = 4313,
sd = 583),
type = 'l',
main = "Men Normal Dist. Times")
plot(x = 2500:7000,
y = dnorm(x = 2500:7000,
mean = 5261,
sd = 807),
type ='l',
main = "Womens Times Normal Dist.")
5B1. Find the cutoff for the fastest 5% of men's times. Although this is often described as the 95th percentile of performance, in the time distribution we actually need the 5th percentile, because faster times are smaller numbers; the 95th percentile of times would give the slowest 5% of times.
qnorm(.05, mean = 4313, sd = 583)
## [1] 3354.05
The cutoff for the fastest 5% of times in the men's group is 3354.
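The same cutoff can be checked by standardizing: the 5th percentile of a standard normal is about z = -1.645, so \(4313 - 1.645 * 583 \approx 3354\), which matches the qnorm() result.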
5B2. Find the cutoff time for the slowest 10% of women. Here we need the 90th percentile of the distribution because we want the longest times (on the right-hand side of the graph).
qnorm(.90, mean = 5261, sd = 807)
## [1] 6295.212
The cutoff for the slowest 10% of athletes in the women’s group is 6295.