1A
# Set Directory
setwd("/Users/Ryan McNulty/Documents/Data Analysis/DATA/challenger")
# Read file in
chal <- read.csv(
file = "challenger.csv", # File name
na.strings = c("", "NA") # Replace blanks with NA
)
chal
## launch temp incident o_ring_probs
## 1 1 53.6 Yes 3
## 2 2 57.2 Yes 1
## 3 3 57.2 Yes 1
## 4 4 62.6 Yes 1
## 5 5 66.2 No 0
## 6 6 66.2 No 0
## 7 7 66.2 No 0
## 8 8 66.2 No 0
## 9 9 66.2 No 0
## 10 10 68.0 No 0
## 11 11 69.8 Yes 1
## 12 12 69.8 No 0
## 13 13 69.8 Yes 1
## 14 14 69.8 No 0
## 15 15 69.8 No 0
## 16 16 71.6 No 0
## 17 17 73.4 No 0
## 18 18 75.2 Yes 2
## 19 19 75.2 No 0
## 20 20 75.2 No 0
## 21 21 78.8 No 0
## 22 22 78.8 No 0
## 23 23 80.6 No 0
## 1A
library(psych)
describe(chal)
## vars n mean sd median trimmed mad min max range skew
## launch 1 23 12.00 6.78 12.0 12.00 8.90 1.0 23.0 22 0.00
## temp 2 23 69.02 6.97 69.8 69.33 5.34 53.6 80.6 27 -0.40
## incident* 3 23 1.30 0.47 1.0 1.26 0.00 1.0 2.0 1 0.80
## o_ring_probs 4 23 0.43 0.79 0.0 0.26 0.00 0.0 3.0 3 1.81
## kurtosis se
## launch -1.36 1.41
## temp -0.44 1.45
## incident* -1.42 0.10
## o_ring_probs 2.69 0.16
1B
launch: ordinal ID variable
temp: interval quantitative but zero not lowest
Incident: Nominal because it’s categorical
o_ring_probs:ratio quantitative with zero lowest (counts)
1C
hist(chal$o_ring_probs,
main="O Ring Problem Histogram",
xlab="Count of Problems",
xlim=c(0,5),
col="darkmagenta",
freq=FALSE)
1D
boxplot(
chal$temp~chal$incident,
notch=FALSE,
horizontal=F
)
A lower environmental temperature might have been a concern because the incidents all occurred with lower temperature readings. Maybe the low temps affected the sealing on the o-rings.
1E
o <- chal[chal$incident == 'No', ]
o
## launch temp incident o_ring_probs
## 5 5 66.2 No 0
## 6 6 66.2 No 0
## 7 7 66.2 No 0
## 8 8 66.2 No 0
## 9 9 66.2 No 0
## 10 10 68.0 No 0
## 12 12 69.8 No 0
## 14 14 69.8 No 0
## 15 15 69.8 No 0
## 16 16 71.6 No 0
## 17 17 73.4 No 0
## 19 19 75.2 No 0
## 20 20 75.2 No 0
## 21 21 78.8 No 0
## 22 22 78.8 No 0
## 23 23 80.6 No 0
The first successful launch was the 5th one.
1F
a <- chal[chal$temp > 65, ]
a <- a[a$incident == 'Yes', ]
i <- nrow(a)
i
## [1] 3
There were 3 incidents above 65 degrees F.
2A
# Define variables
# Probability of a
a <- 0.2
# Probability (b | a) / Sensitivity
bGivena <- 0.59
# Probability (¬b | ¬a) / Specificity
notbGivenNota <- 0.9
# Calculate the rest of the values based upon the 3 variables above
notA <- 1 - a
notbGivena <- 1 - bGivena
bGivenNota <- 1 - notbGivenNota
# Joint Probabilities of a and B, a and notb, nota and b, nota and notb
aANDb <- a * bGivena
aANDnotb <- a * notbGivena
notaANDb <- notA * bGivenNota
notaANDnotb <- notA * notbGivenNota
# Probability of B
b <- aANDb + notaANDb
notB <- 1 - b
# Bayes theorem - probability of A | B
# Prob (a | b) = Prob (a AND b) / Prob (b)
aGivenb <- aANDb / b
ans <- round(aGivenb,3)
cat("The probability the user is actually a liar is",ans)
## The probability the user is actually a liar is 0.596
2B Probability that a user is either a liar or identified as a liar P(A|B) = P(Liar|Detected) = Prob (a AND b) / Prob (b)
library(BiocManager)
library("Rgraphviz")
## Loading required package: graph
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: grid
#############################################
## 3. START CODING ELEMENTS FOR TREE FOR GRAPH
#############################################
### NODES (labels)
# These are the labels of the nodes on the graph
# To signify "Not A" - we use A' or A prime
node1 <- "P"
node2 <- "Liar" #A
node3 <- "Truther'" #A'
node4 <- "Liar&Detected" #A&B
node5 <- "Liar&NotDetected'" #A&B'
node6 <- "Truther&Detected" #A'&B
node7 <- "Truther&NotDetected" #A'&B'
nodeNames <- c(node1, node2, node3, node4, node5, node6, node7)
rEG <- new("graphNEL",
nodes = nodeNames,
edgemode="directed"
)
# Erase any existing plots
# dev.off()
### LINES
# Draw the "lines" or "branches" of the probability Tree
rEG <- addEdge (nodeNames[1], nodeNames[2], rEG, 1)
rEG <- addEdge (nodeNames[1], nodeNames[3], rEG, 1)
rEG <- addEdge (nodeNames[2], nodeNames[4], rEG, 1)
rEG <- addEdge (nodeNames[2], nodeNames[5], rEG, 1)
rEG <- addEdge (nodeNames[3], nodeNames[6], rEG, 1)
rEG <- addEdge (nodeNames[3], nodeNames[7], rEG, 10)
eAttrs <- list()
q <- edgeNames(rEG)
### PROBABILITY VALUES
# Add the probability values to the the branch lines
eAttrs$label <- c(toString(a), toString(notA),
toString(bGivena), toString(notbGivena),
toString(bGivenNota), toString(notbGivenNota)
)
names(eAttrs$label) <- c( q[1], q[2], q[3], q[4], q[5], q[6] )
edgeAttrs <- eAttrs
### COLOR
# Set the color, etc, of the tree
attributes <- list(node = list(label = "foo",
fillcolor = "lightgreen",
fontsize = "15"
),
edge = list(color = "red"),
graph = list(rankdir = "LR")
)
### PLOT
# Plot the probability tree using Rgraphvis
plot (rEG, edgeAttrs = eAttrs, attrs=attributes)
nodes(rEG)
## [1] "P" "Liar" "Truther'"
## [4] "Liar&Detected" "Liar&NotDetected'" "Truther&Detected"
## [7] "Truther&NotDetected"
edges(rEG)
## $P
## [1] "Liar" "Truther'"
##
## $Liar
## [1] "Liar&Detected" "Liar&NotDetected'"
##
## $`Truther'`
## [1] "Truther&Detected" "Truther&NotDetected"
##
## $`Liar&Detected`
## character(0)
##
## $`Liar&NotDetected'`
## character(0)
##
## $`Truther&Detected`
## character(0)
##
## $`Truther&NotDetected`
## character(0)
### PROBABILITY (labels)
#Add the probability values to the leaves of A&B, A&B', A'&B, A'&B'
text(500,420, aANDb, cex = .8)
text(500,280, aANDnotb, cex = .8)
text(500,160, notaANDb, cex = .8)
text(500,30 , notaANDnotb, cex = .8)
text(340,440, "(Detected | Liar)", cex = .8)
text(340,230, "(Detected | Truther)", cex = .8)
#Write a table in the lower left of the probablites of A and B
text(80,50, paste("P(Liar):" ,a ), cex = .9, col="darkgreen")
text(80,20, paste("P(Truther):" ,notA ), cex = .9, col="darkgreen")
text(240,50, paste("P(Detected):" ,round(b, digits = 2)), cex = .9)
text(240,20, paste("P(NotDetected):" ,round(notB, digits = 2)), cex = .9)
text(80,420, paste("P(Liar|Detected):" ,round(aGivenb, digits = 2)), cex = .9, col = "blue")
# I took these from the graph - it's displaying off in the markdown when it's fine in the preview
prob <- 0.118 + 0.082 + 0.08 # aANDb + aANDnotb + notaANDnotb
cat("The probability a individual is a liar or a detected as liar is", prob)
## The probability a individual is a liar or a detected as liar is 0.28
Interval is 10 years 1/10 chance for failure every year
3A -Poisson
f <- .1
prob <- round(ppois(0,f)^8,4)
ev <- round(8*f,4)
sd <- round(sqrt(ev),4)
cat("The probability the machine fails after 8 years is", prob)
## The probability the machine fails after 8 years is 0.4493
cat("\nExpected Value is", ev)
##
## Expected Value is 0.8
cat("\nStandard deviation is", sd)
##
## Standard deviation is 0.8944
3B - Binomial
# Define variables
n <- 10 # Number of year
s <- 0.9 # Probability of success
f <- 0.1
prob <- round(pbinom(0,8,f),4)
ev <- round(8 * s,4)
sd <- round(sqrt(n*s*f),4)
cat("The probability that the machine will fail after 8 years",prob,"\n")
## The probability that the machine will fail after 8 years 0.4305
cat("\nExpected Value is", ev)
##
## Expected Value is 7.2
cat("\nStandard deviation is", sd)
##
## Standard deviation is 0.9487
4A
q <- 5
prob_c <- .25
prob_w <- .75
third <- round(prob_w * prob_w * prob_c,4)
cat("The probability the 3rd question is the 1st correct", third)
## The probability the 3rd question is the 1st correct 0.1406
4B The random variable X is the number of questions Robin answers correctly
This is a binomial distribution 1) Two possible outcomes - correct or wrong 2) Outcomes are exclusive - no partial credit 3) Counting correct answers 4) Each is independent
P(3 <= X <= 4|n = 5, pi = .25)
n <- 5 # Number of trials
pi <- 0.25 # Probability of success
s <- 3:4 # Range of successes
#
a <- round(sum(dbinom(x=s, size=n, prob= pi)),4)
cat("The probability is",a)
## The probability is 0.1025
4C Majority of questions right means 3/5 must be correct. P(X >= 3 |n = 5, pi = .25)
n <- 5 # Number of trials
pi <- 0.25 # Probability of success
s <- 3 # Range of successes
a <- round(pbinom(s,n,pi, lower.tail=FALSE),4)
cat("The probability is",a)
## The probability is 0.0156
b <- 1 - round(pbinom(s,n,pi, lower.tail=TRUE),4)
cat("\nThe probability is",b)
##
## The probability is 0.0156
5A1 X is the number of passenger vehicles P()
pop_m <- 72.6
pop_sd <- 4.78
success <- 0:80
upper <- 80
#### FOR GRAPHING
#define upper and lower bound
population_mean <- pop_m
population_sd <- pop_sd
lower_bound <- population_mean - population_sd
upper_bound <- population_mean + population_sd
#Create a sequence of 1000 x values based on population mean and standard deviation
x <- seq(from = -4,
to = 4,
length = 1000) * population_sd + population_mean
#create a vector of values that shows the height of the probability distribution
#for each value in x
y <- dnorm(x = x,
mean = population_mean,
sd = population_sd)
#plot normal distribution with customized x-axis labels
plot(x = x,
y = y,
type = "l",
lwd = 2,
axes = FALSE,
xlab = "",
ylab = ""
)
sd_axis_bounds <- 5
axis_bounds <- seq(from = -sd_axis_bounds * population_sd + population_mean,
to = sd_axis_bounds * population_sd + population_mean,
by = population_sd)
axis(side = 1, at = axis_bounds, pos = 0)
prob <- pnorm(q=upper,mean=pop_m,sd=pop_sd)
prob <- round(prob,4)
cat("The percent of passengers going less than 80 mph is",prob)
## The percent of passengers going less than 80 mph is 0.9392
5A2
pop_m <- 72.6
pop_sd <- 4.78
upper <- 78
lower <- 68
prob <- pnorm(q=upper,mean=pop_m,sd=pop_sd) - pnorm(q=lower,mean=pop_m,sd=pop_sd)
prob <- round(prob,4)
cat("The percent of passengers going between 68 and 78 is",prob)
## The percent of passengers going between 68 and 78 is 0.7028
This percentage (~70%) makes sense because in a normal distribution most of the data points are within 1-2 standard deviations of the mean, and these speeds are on either side of the average speed.
5A3
pop_m <- 72.6
pop_sd <- 4.78
lower <- 70
prob <- 1 - pnorm(q=lower,mean=pop_m,sd=pop_sd,lower.tail=FALSE)
prob <- round(prob,4)
cat("The percent of passengers going over 70 is",prob)
## The percent of passengers going over 70 is 0.2932
5B1
pop_m <- 4313
pop_sd <- 583
cutoff <- .05
prob <- round(qnorm(p=cutoff,mean=pop_m,sd=pop_sd),0)
cat("The cutoff time is (seconds)",prob)
## The cutoff time is (seconds) 3354
5B2
pop_m <- 5261
pop_sd <- 807
cutoff <- .90
prob <- round(qnorm(p=cutoff,mean=pop_m,sd=pop_sd),0)
cat("The cutoff time is (seconds)",prob)
## The cutoff time is (seconds) 6295