ADAN7310 - Midterm

QUESTION 1

# Set Directory
setwd("/Users/Ryan McNulty/Documents/Data Analysis/DATA/challenger")

# Read file in
chal <- read.csv(
  file = "challenger.csv", # File name
  na.strings = c("", "NA") # Replace blanks with NA
)

chal

##    launch temp incident o_ring_probs
## 1       1 53.6      Yes            3
## 2       2 57.2      Yes            1
## 3       3 57.2      Yes            1
## 4       4 62.6      Yes            1
## 5       5 66.2       No            0
## 6       6 66.2       No            0
## 7       7 66.2       No            0
## 8       8 66.2       No            0
## 9       9 66.2       No            0
## 10     10 68.0       No            0
## 11     11 69.8      Yes            1
## 12     12 69.8       No            0
## 13     13 69.8      Yes            1
## 14     14 69.8       No            0
## 15     15 69.8       No            0
## 16     16 71.6       No            0
## 17     17 73.4       No            0
## 18     18 75.2      Yes            2
## 19     19 75.2       No            0
## 20     20 75.2       No            0
## 21     21 78.8       No            0
## 22     22 78.8       No            0
## 23     23 80.6       No            0

## 1A
library(psych)
describe(chal)

##              vars  n  mean   sd median trimmed  mad  min  max range  skew
## launch          1 23 12.00 6.78   12.0   12.00 8.90  1.0 23.0    22  0.00
## temp            2 23 69.02 6.97   69.8   69.33 5.34 53.6 80.6    27 -0.40
## incident*       3 23  1.30 0.47    1.0    1.26 0.00  1.0  2.0     1  0.80
## o_ring_probs    4 23  0.43 0.79    0.0    0.26 0.00  0.0  3.0     3  1.81
##              kurtosis   se
## launch          -1.36 1.41
## temp            -0.44 1.45
## incident*       -1.42 0.10
## o_ring_probs     2.69 0.16

launch: ordinal ID variable
temp: interval quantitative but zero not lowest
Incident: Nominal because it’s categorical
o_ring_probs:ratio quantitative with zero lowest (counts)

hist(chal$o_ring_probs,
      main="O Ring Problem Histogram",
      xlab="Count of Problems",
      xlim=c(0,5),
      col="darkmagenta",
      freq=FALSE)

boxplot(
  chal$temp~chal$incident, 
  notch=FALSE, 
  horizontal=F
)

A lower environmental temperature might have been a concern because the incidents all occurred with lower temperature readings. Maybe the low temps affected the sealing on the o-rings.

o <- chal[chal$incident == 'No', ]  
o

##    launch temp incident o_ring_probs
## 5       5 66.2       No            0
## 6       6 66.2       No            0
## 7       7 66.2       No            0
## 8       8 66.2       No            0
## 9       9 66.2       No            0
## 10     10 68.0       No            0
## 12     12 69.8       No            0
## 14     14 69.8       No            0
## 15     15 69.8       No            0
## 16     16 71.6       No            0
## 17     17 73.4       No            0
## 19     19 75.2       No            0
## 20     20 75.2       No            0
## 21     21 78.8       No            0
## 22     22 78.8       No            0
## 23     23 80.6       No            0

The first successful launch was the 5th one.

a <- chal[chal$temp > 65, ]
a <- a[a$incident == 'Yes', ]
i <- nrow(a)
i

## [1] 3

There were 3 incidents above 65 degrees F.

QUESTION 2

# Define variables
# Probability of a
a <- 0.2
# Probability (b | a) / Sensitivity
bGivena <- 0.59
# Probability (¬b | ¬a) / Specificity
notbGivenNota <- 0.9

# Calculate the rest of the values based upon the 3 variables above
notA            <-    1 - a
notbGivena      <-    1 - bGivena
bGivenNota      <-    1 - notbGivenNota

# Joint Probabilities of a and B, a and notb, nota and b, nota and notb
aANDb          <-   a    *  bGivena
aANDnotb       <-   a    *  notbGivena
notaANDb       <-   notA *  bGivenNota
notaANDnotb    <-   notA *  notbGivenNota

# Probability of B
b <- aANDb + notaANDb
notB <-   1 - b

# Bayes theorem - probability of A | B
# Prob (a | b) = Prob (a AND b) / Prob (b)
aGivenb <- aANDb / b
ans <- round(aGivenb,3)
cat("The probability the user is actually a liar is",ans)

## The probability the user is actually a liar is 0.596

2B Probability that a user is either a liar or identified as a liar P(A|B) = P(Liar|Detected) = Prob (a AND b) / Prob (b)

library(BiocManager)
library("Rgraphviz")

## Loading required package: graph

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min

## Loading required package: grid

#############################################
## 3. START CODING ELEMENTS FOR TREE FOR GRAPH 
#############################################

### NODES (labels)
# These are the labels of the nodes on the graph
# To signify "Not A" - we use A' or A prime 

node1     <-  "P"
node2     <-  "Liar"      #A
node3     <-  "Truther'"     #A'
node4     <-  "Liar&Detected"    #A&B
node5     <-  "Liar&NotDetected'"   #A&B'
node6     <-  "Truther&Detected"   #A'&B
node7     <-  "Truther&NotDetected"  #A'&B'
nodeNames <- c(node1, node2, node3, node4, node5, node6, node7)

rEG   <- new("graphNEL", 
             nodes = nodeNames, 
             edgemode="directed"
             )

# Erase any existing plots
# dev.off()


### LINES
# Draw the "lines" or "branches" of the probability Tree
rEG  <- addEdge (nodeNames[1], nodeNames[2], rEG, 1)
rEG  <- addEdge (nodeNames[1], nodeNames[3], rEG, 1)
rEG  <- addEdge (nodeNames[2], nodeNames[4], rEG, 1)
rEG  <- addEdge (nodeNames[2], nodeNames[5], rEG, 1)
rEG  <- addEdge (nodeNames[3], nodeNames[6], rEG, 1)
rEG  <- addEdge (nodeNames[3], nodeNames[7], rEG, 10)

eAttrs  <- list()

q    <-  edgeNames(rEG)



### PROBABILITY VALUES
# Add the probability values to the the branch lines

eAttrs$label <- c(toString(a),          toString(notA),
                  toString(bGivena),    toString(notbGivena),
                  toString(bGivenNota), toString(notbGivenNota)
                  )

names(eAttrs$label) <- c( q[1], q[2], q[3], q[4], q[5], q[6] )
edgeAttrs <- eAttrs


### COLOR
# Set the color, etc, of the tree
attributes <- list(node  = list(label    = "foo", 
                              fillcolor = "lightgreen", 
                              fontsize  = "15"
                              ),
                   edge  = list(color   = "red"),
                   graph = list(rankdir = "LR")
                   )


### PLOT
# Plot the probability tree using Rgraphvis
plot (rEG, edgeAttrs = eAttrs, attrs=attributes)
nodes(rEG)

## [1] "P"                   "Liar"                "Truther'"           
## [4] "Liar&Detected"       "Liar&NotDetected'"   "Truther&Detected"   
## [7] "Truther&NotDetected"

edges(rEG)

## $P
## [1] "Liar"     "Truther'"
## 
## $Liar
## [1] "Liar&Detected"     "Liar&NotDetected'"
## 
## $`Truther'`
## [1] "Truther&Detected"    "Truther&NotDetected"
## 
## $`Liar&Detected`
## character(0)
## 
## $`Liar&NotDetected'`
## character(0)
## 
## $`Truther&Detected`
## character(0)
## 
## $`Truther&NotDetected`
## character(0)

### PROBABILITY (labels)
#Add the probability values to the leaves of A&B, A&B', A'&B, A'&B'

text(500,420,       aANDb, cex = .8)
text(500,280,    aANDnotb, cex = .8)
text(500,160,    notaANDb, cex = .8)
text(500,30 , notaANDnotb, cex = .8)
text(340,440,   "(Detected | Liar)", cex = .8)
text(340,230,  "(Detected | Truther)", cex = .8)

#Write a table in the lower left of the probablites of A and B
text(80,50,  paste("P(Liar):"   ,a    ),                      cex = .9, col="darkgreen")
text(80,20,  paste("P(Truther):"  ,notA ),                      cex = .9, col="darkgreen")

text(240,50, paste("P(Detected):"   ,round(b,       digits = 2)), cex = .9)
text(240,20, paste("P(NotDetected):"  ,round(notB,    digits = 2)), cex = .9)

text(80,420, paste("P(Liar|Detected):" ,round(aGivenb, digits = 2)), cex = .9, col = "blue")

# I took these from the graph - it's displaying off in the markdown when it's fine in the preview
prob <- 0.118 + 0.082 + 0.08  # aANDb + aANDnotb + notaANDnotb
cat("The probability a individual is a liar or a detected as liar is", prob)

## The probability a individual is a liar or a detected as liar is 0.28

Question 3

Interval is 10 years 1/10 chance for failure every year

3A -Poisson

f <- .1
prob <- round(ppois(0,f)^8,4)
ev <- round(8*f,4)
sd <- round(sqrt(ev),4)

cat("The probability the machine fails after 8 years is", prob)

## The probability the machine fails after 8 years is 0.4493

cat("\nExpected Value is", ev)

## 
## Expected Value is 0.8

cat("\nStandard deviation is", sd)

## 
## Standard deviation is 0.8944

3B - Binomial

# Define variables
n <- 10   # Number of year
s <- 0.9 # Probability of success 
f <- 0.1

prob <- round(pbinom(0,8,f),4)
ev <- round(8 * s,4)
sd <- round(sqrt(n*s*f),4)

cat("The probability that the machine will fail after 8 years",prob,"\n")

## The probability that the machine will fail after 8 years 0.4305

cat("\nExpected Value is", ev)

## 
## Expected Value is 7.2

cat("\nStandard deviation is", sd)

## 
## Standard deviation is 0.9487

Question 4

q <- 5
prob_c <- .25
prob_w <- .75

third <- round(prob_w * prob_w * prob_c,4)
cat("The probability the 3rd question is the 1st correct", third)

## The probability the 3rd question is the 1st correct 0.1406

4B The random variable X is the number of questions Robin answers correctly

This is a binomial distribution 1) Two possible outcomes - correct or wrong 2) Outcomes are exclusive - no partial credit 3) Counting correct answers 4) Each is independent

P(3 <= X <= 4|n = 5, pi = .25)

n <- 5   # Number of trials
pi <- 0.25 # Probability of success
s <- 3:4 # Range of successes

# 
a <- round(sum(dbinom(x=s, size=n, prob= pi)),4)
cat("The probability is",a)

## The probability is 0.1025

4C Majority of questions right means 3/5 must be correct. P(X >= 3 |n = 5, pi = .25)

n <- 5   # Number of trials
pi <- 0.25 # Probability of success
s <- 3 # Range of successes

a <- round(pbinom(s,n,pi, lower.tail=FALSE),4)
cat("The probability is",a)

## The probability is 0.0156

b <- 1 - round(pbinom(s,n,pi, lower.tail=TRUE),4)
cat("\nThe probability is",b)

## 
## The probability is 0.0156

Question 5

5A1 X is the number of passenger vehicles P()

pop_m <- 72.6
pop_sd <- 4.78
success <- 0:80
upper <- 80

#### FOR GRAPHING 
#define upper and lower bound
population_mean     <-    pop_m
population_sd       <-    pop_sd
lower_bound   <-  population_mean - population_sd
upper_bound   <-  population_mean + population_sd

#Create a sequence of 1000 x values based on population mean and standard deviation
x <- seq(from   = -4, 
         to     = 4, 
         length = 1000) * population_sd + population_mean

#create a vector of values that shows the height of the probability distribution
#for each value in x
y <- dnorm(x    = x, 
           mean = population_mean, 
           sd   = population_sd)

#plot normal distribution with customized x-axis labels
plot(x = x,
     y = y, 
     type = "l", 
     lwd = 2, 
     axes = FALSE, 
     xlab = "", 
     ylab = ""
     )
sd_axis_bounds <- 5
axis_bounds <- seq(from = -sd_axis_bounds * population_sd + population_mean,
                     to =  sd_axis_bounds * population_sd + population_mean,
                     by = population_sd)

axis(side = 1, at = axis_bounds, pos = 0)

prob <- pnorm(q=upper,mean=pop_m,sd=pop_sd)
prob <- round(prob,4)
cat("The percent of passengers going less than 80 mph is",prob)

## The percent of passengers going less than 80 mph is 0.9392

5A2

pop_m <- 72.6
pop_sd <- 4.78
upper <- 78
lower <- 68

prob <- pnorm(q=upper,mean=pop_m,sd=pop_sd) - pnorm(q=lower,mean=pop_m,sd=pop_sd)
prob <- round(prob,4)
cat("The percent of passengers going between 68 and 78 is",prob)

## The percent of passengers going between 68 and 78 is 0.7028

This percentage (~70%) makes sense because in a normal distribution most of the data points are within 1-2 standard deviations of the mean, and these speeds are on either side of the average speed.

5A3

pop_m <- 72.6
pop_sd <- 4.78
lower <- 70

prob <- 1 - pnorm(q=lower,mean=pop_m,sd=pop_sd,lower.tail=FALSE)
prob <- round(prob,4)
cat("The percent of passengers going over 70 is",prob)

## The percent of passengers going over 70 is 0.2932

5B1

pop_m <- 4313
pop_sd <- 583
cutoff <- .05

prob <- round(qnorm(p=cutoff,mean=pop_m,sd=pop_sd),0)
cat("The cutoff time is (seconds)",prob)

## The cutoff time is (seconds) 3354

5B2

pop_m <- 5261
pop_sd <- 807
cutoff <- .90

prob <- round(qnorm(p=cutoff,mean=pop_m,sd=pop_sd),0)
cat("The cutoff time is (seconds)",prob)

## The cutoff time is (seconds) 6295