# Probability of finding the liar via the polygraph
# Sensitivity of polygraph = A / A+C
# Specificity of polygraph = D / D+B
A <- 0.8
D <- 0.9
C <- 1-A
B <- 1-D
Liar <- A+C
Truth <- B+D
PredLiar <- A+B
PredTruth <- C+D
Total <- A+B+C+D
# Pure Positive
ProbA <- A/Total
#False Negative
ProbC <- C/Total
#False Positive
ProbB <- B/Total
# Pure Negative
ProbD <- D/Total
polygraph <- matrix(c(A,B,PredLiar, C,D,PredTruth,Liar,Truth,Total),ncol=3,byrow=TRUE)
colnames(polygraph) <- c("Liar", "Truth", "Total")
rownames(polygraph) <- c("Pure Liar", "Pure Truth", "Total")
polygraph <- as.table(polygraph)
polygraph
## Liar Truth Total
## Pure Liar 0.8 0.1 0.9
## Pure Truth 0.2 0.9 1.1
## Total 1.0 1.0 2.0
ALiar <- A/PredLiar
DTruth <- D/PredTruth
ProbLiar <- 0.3
cat("A. The probability that the individual is actually a liar given they have been detected as such is",
"\n",round(ALiar,2),
"\n", "B. The probability that the individual is actually telling the truth given they have been detected of doing so is",
"\n", round(DTruth,2),
"\n", "C. The probability of a randomly selcted individual being a true liar or being detected as a liar is",
"\n", ProbLiar+(PredLiar/Total))
## A. The probability that the individual is actually a liar given they have been detected as such is
## 0.89
## B. The probability that the individual is actually telling the truth given they have been detected of doing so is
## 0.82
## C. The probability of a randomly selcted individual being a true liar or being detected as a liar is
## 0.75
# P(x >=10|n=15,pi=1/15)
pi <- 1/15
mygeom <- dgeom(0:15,pi)
sum(mygeom[0:11])
## [1] 0.531829
# P(x>=10 | n=15, pi=1/15)
pi <-1/15
n<-15
sum(dbinom(10:15, 15, 1/15))
## [1] 3.810952e-09
#Expected Value
ex.binom <- n*pi
ex.binom
## [1] 1
#Standard Deviation
sd.binom = sqrt(n*pi*(1-pi))
sd.binom
## [1] 0.9660918
pi <- 1/15
N <- 15
n <- 10
t <- 1
lambda <- n*pi/t
pois <- dpois(1:100000,(lambda*t))
sum(pois)
## [1] 0.4865829
library(readxl)
challenger <- read_excel("C:/Users/earth/Downloads/challenger.xlsx")
View(challenger)
The variable launch is considered to be nominal because each observation is numeric, but they are identifiers for each particular event. They do not have or follow a particular order and the data cannot calculate a mean or median as a result of not representing particular measures.The variable temp is considered to be interval, as it is again a numeric value, however does not have a true zero value, as a result of having no absence of temperature. The variable incident is considered to be nominal as the numerical values are created to categorize the data of each event and they cannot be ranked as a result of not having measurable intervals. The variable o_ring_probs is considered to be ratio, as it’s numerical value helps to represent a true and equal interval, while also having a true zero. It can be used to find the mean and median of this part of the data.
library(psych)
describe(challenger$o_ring_probs)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 23 0.43 0.79 0 0.26 0 0 3 3 1.81 2.69 0.16
#Histogram
hist(challenger$o_ring_probs, main = "Frequency of O-Ring Failures", xlab = "Number of O-Ring Failures", ylab = "Frequency")
#Normal Distribution
mean.oring <- mean(challenger$o_ring_probs)
sd.oring <- sd(challenger$o_ring_probs)
curve(dnorm(x, mean=mean.oring, sd=sd.oring), xlab = "O-Ring Failures", ylab = "Distribution")
#Confidence Interval of 95%
p <- mean.oring
n <- length(challenger$o_ring_probs)
alpha = 0.05
z = qnorm(1-alpha/2)
se = sqrt(p*(1-p)/n)
lower = round((p-z*se),3)
upper = round((p+z*se),3)
cat("With a 95% confidence interval, launches will include between", lower, "and", upper, "O-Ring failures. Since the failure must be a whole number we can assume that at a 95% confidence interval that a lauch will most likely contain 0 to 1 O-Ring failures.")
## With a 95% confidence interval, launches will include between 0.232 and 0.637 O-Ring failures. Since the failure must be a whole number we can assume that at a 95% confidence interval that a lauch will most likely contain 0 to 1 O-Ring failures.
The following states the a few of the variables of descriptive statistics for o_ring_probs: - Mean: 0.43 - Standard Deviation: 0.79 -Median: 0 -Skew: 1.81 -Standard Error: 0.16 From the histogram it seems that the lower amount of O-ring Failures led to a higher frequency rate. The histogram seems to be skewed to the right. The normal distribution curve shows that within the range of 0.2 to 0.6 failures, there is a high distribution level and it reaches its peak. This curve also seems to be slightly skewed to the right but more centered.
boxplot(temp~o_ring_probs, data=challenger, main="Temperature at Time of the Launch vs Number of O-Ring Failures", xlab="Number of O-Ring Failures", ylab = "Temperature")
The box plot shows that between 0 to 1 O-ring failures occur within the range of 75 degrees and 57 degrees. There are no box plots for 2 and 3 O-ring failures because they only occur once. The only times they did occur they happened outside the upper quartile of about 75 degrees and the lower quartile of about 57 degrees Fahrenheit.
mean(iris$Sepal.Length)
## [1] 5.843333
sd(iris$Sepal.Length)
## [1] 0.8280661
install.packages("car", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/earth/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'car' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\earth\AppData\Local\Temp\Rtmpa0p2ww\downloaded_packages
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplot( Sepal.Length ~ Petal.Length | Species, data=iris, xlab = "Sepal Length", ylab = "Petal Length", main = "Sepal vs Petal Length")
For all of the species, petal length continuously increases as sepal length increases. The Setosa flowers have the shortest petal and sepal length wile Virginica flowers have the longest. The Setosa flower seems to have more scattered data and not as much centered towards the line of best fit when it comes to sepal length and petal length compared to the Versicolor and the Virginica flowers.
N <- 10000
n <- 100
p <- 0.9
x <- rbinom(N,n,p)
hist(x,
xlim = c(min(x), max(x)),
probability = TRUE,
#bins
nclass = max(x) - min(x) + 1,
main = 'Salary Distribution (Age 4 and Over, Not Retired Yet)',
xlab = "Salary in Thousands")
lines(density(x,bw=1), lwd=3)
mean(x)
## [1] 89.9592
median(x)
## [1] 90
# mode
getmode <- function(y) {
uniqy <- unique(y)
uniqy[which.max(tabulate(match(y, uniqy)))]
}
getmode(x)
## [1] 90
The distribution of the graph is skewed to the left and the median and mode being slightly greater than the mean. With the distribution having a heavy upper tail, there are a smaller percentage of people who are observed to have a larger portion of the total amount of income.
B1 <- 12
B2 <- 2
A1B1 <- round(7/B1,2)
A1B2 <- round(7/B2,2)
A2B1 <- round(5/B1,2)
A2B2 <- round(5/B2,2)
pigs1 <- matrix(c(A1B1,A1B2,A2B1,A2B2),ncol=2,byrow=TRUE)
colnames(pigs1) <- c("Survived","Died")
rownames(pigs1) <- c("Drug 1","Drug 2")
pigs1 <- as.table(pigs1)
pigs1
## Survived Died
## Drug 1 0.58 3.50
## Drug 2 0.42 2.50
c.A1B1 <- round(7/14,2)
c.A1B2 <- round(0/14,2)
c.A2B1 <- round(5/14,2)
c.A2B2 <- round(2/14,2)
pigs2 <- matrix(c(c.A1B1,c.A1B2, c.A1B1+c.A1B2,c.A2B1, c.A2B2, c.A2B1+c.A2B2,c.A1B1+ c.A2B1, c.A1B2+c.A2B2, c.A1B1+c.A1B2+c.A2B1+c.A2B2),ncol=3,byrow=TRUE)
colnames(pigs2) <- c("Survived","Died","Total")
rownames(pigs2) <- c("Drug 1", "Drug 2", "Total")
pigs2 <- as.table(pigs2)
pigs2
## Survived Died Total
## Drug 1 0.50 0.00 0.50
## Drug 2 0.36 0.14 0.50
## Total 0.86 0.14 1.00
# Independent then P(AIBJ) == P(AI)*P(BJ)
A1 <- 7/14
A2 <- 7/14
B1 <- 12/14
B2 <- 2/14
#A1B1
t1 <- 0.58 == (A1*B1)
m1 <- round(A1*B1,2)
t2 <- 0 == (A1*B2)
m2 <- round(A1*B2,2)
t3 <- 0.42 == (A2*B1)
m3 <- round(A2*B1,2)
t4 <- 1 == (A2*B2)
m4 <- round(A2*B2,2)
cat("True or False:
P(A1B1)=P(A1)*P(B1)?",t1,"\n",
"P(A1B2)=P(A1)*P(B2)?",t2,"\n",
"P(A2B1)=P(A2)*P(B1)?",t3,"\n",
"P(A2B2)=P(A2)*P(B2)?",t4,"\n",
"What is P(A1)*P(B1) equal to?",m1,"\n",
"What is P(A1)*P(B2) equal to?",m2,"\n",
"What is P(A2)*P(B1) equal to?",m3,"\n",
"What is P(A2)*P(B2) equal to?",m4)
## True or False:
## P(A1B1)=P(A1)*P(B1)? FALSE
## P(A1B2)=P(A1)*P(B2)? FALSE
## P(A2B1)=P(A2)*P(B1)? FALSE
## P(A2B2)=P(A2)*P(B2)? FALSE
## What is P(A1)*P(B1) equal to? 0.43
## What is P(A1)*P(B2) equal to? 0.07
## What is P(A2)*P(B1) equal to? 0.43
## What is P(A2)*P(B2) equal to? 0.07
The graph is meant to represent and portray the GDP growth by percentage between the US and the Euro area. However, using the 3D bar graph causes for some confusion and some uncertainty about the data.One way to better represent that data would be to provide two separate lines/curves that show the change in GDP per year for each country.
install.packages("ggplot2", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/earth/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\earth\AppData\Local\Temp\Rtmpa0p2ww\downloaded_packages
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
#2 vectors
US <- c(3.7,4.5,4.2,4.5,3.7,0.8,1.6,2.7,4.2,3.5,2.9)
Euro <- c(1.5,2.6,2.9,2.8,3.7,1.8,0.9,0.7,1.9,1.0,1.8)
#Range
g_range <- range(0,US,Euro)
#Graph
plot(US, type='o', col="blue", ylim=g_range, axes=FALSE, ann=FALSE)
#X-Axis
axis(1, at=1:11, lab=c("1996","1997","1998","1999","2000","2001","2002","2003","2004","2005","2006"))
#Y-Axis
axis(2, las=1, at=4*0:g_range[2])
box()
lines(Euro, type="o", pch=22, lty=2, col="red")
title(main="The Transatlantic Gulf", col.main="red", font.main=4)
title(xlab="Years", col.lab=rgb(0,0.5,0))
title(ylab="GDP Growth %", col.lab=rgb(0,0.5,0))
legend(1, g_range[2], c("US", "Euro"), cex=0.8, col=c("blue", "red"), pch=21:22, lty=1:2)
p=0.58
n=900
alpha=0.02
z=qnorm(1-alpha/2)
se=sqrt(p*(1-p)/n)
lower=p-z*se
upper=p+z*se
interval=c(lower,p,upper)
names(interval)=c('Lower 1% CI', 'proportion estimate', 'Upper 1% CI')
interval
## Lower 1% CI proportion estimate Upper 1% CI
## 0.541727 0.580000 0.618273
With the sample of 900 computers to create a 98% confidence interval, the percentage of chips within the first 1000 hours to fail will be between 54.17% and 61.83%.Therefore the 54% that was stated to fail within the first 1000 hours by the company’s promotional literature falls outside the parameters and is incorrect.
# Hypothesis test
# Null: 54% of chips fail within the first 1000 hours of use. ; pi=0.54
# Alternative: More or less than 54% of chips fail within the first 1000 hours of use; pi does not = 0.54
H0 <- 0.54
alpha = 0.02
1-pnorm(p,H0,se)
## [1] 0.007521905
The p-value is much less than alpha, and therefore H0 is rejected and the alternative hypothesis is accepted. There is enough evidence to approve and agree with the manager’s claims that the actual percentage of chips that fail within the first 1000 hours of use is different than the percentage of 54%.