# known quantities: prevalence of swine flu (pF2), probability of detection
# given swine flu (pD.F2) and given no swine flu (pD.notF2)
pF2 <- 0.6
pD.F2 <- 0.7
pD.notF2 <- 0.1
# overall probability of detection, via the law of total probability
pD <- pD.F2 * pF2 + pD.notF2 * (1 - pF2)
# Bayes' rule: probability of having the swine flu, given detection
pF2.D <- pF2 * pD.F2 / pD
pF2.D
## [1] 0.9130435
The probability of having the swine flu, given detection, is approximately 0.913.
# Sensitivity of P(swine flu | detection) to the prevalence of swine flu.
# Sweep 100 evenly spaced prevalence values from 0.1 to 0.9.
p.F2 <- seq(0.1, 0.9, length.out = 100)
# probability of detection at each prevalence (law of total probability);
# stored under its own name so the scalar pD is not overwritten
pD.by.prevalence <- pD.F2 * p.F2 + pD.notF2 * (1 - p.F2)
# Bayes' rule applied elementwise: one posterior probability per prevalence
p.F2.D.1 <- p.F2 * pD.F2 / pD.by.prevalence
# Sensitivity of P(swine flu | detection) to the false detection rate.
# Sweep 100 evenly spaced false detection rates from 0.01 to 0.5.
p.D.not.F2 <- seq(0.01, 0.5, length.out = 100)
# probability of detection at each false detection rate (law of total
# probability); uses its own name rather than reassigning pD
pD.by.false.rate <- pD.F2 * pF2 + p.D.not.F2 * (1 - pF2)
# Bayes' rule applied elementwise: one posterior probability per rate
p.F2.D.2 <- pF2 * pD.F2 / pD.by.false.rate
# plot two graphs side by side
par(mfrow = c(1, 2))
# left panel: probability of swine flu given detection as a function of the
# prevalence of swine flu
plot(
  p.F2, p.F2.D.1,
  type = "l", ylim = c(0, 1),
  xlab = "Prevalence of Swine Flu",
  ylab = "Probability of Having Swine Flu, Given Detection",
  cex.lab = 0.7, cex.axis = 0.7
)
# dashed reference line at the assumed prevalence (pF2)
abline(v = pF2, col = "red", lty = "dashed")
# right panel: probability of swine flu given detection as a function of the
# false detection rate
plot(
  p.D.not.F2, p.F2.D.2,
  type = "l", ylim = c(0, 1),
  xlab = "False Detection Rate",
  ylab = "Probability of Having Swine Flu, Given Detection",
  cex.lab = 0.7, cex.axis = 0.7
)
# dashed reference line at the assumed false detection rate (pD.notF2)
abline(v = pD.notF2, col = "red", lty = "dashed")
As the prevalence of swine flu increases, the probability of having swine flu given detection increases as well, but at a decreasing rate.
As the false detection rate increases, the probability of having swine flu given detection decreases, at a decreasing rate.
To preface my final answer, I want to reiterate that a lower probability of having swine flu given detection means that a person who tests positive is more likely to have received a false positive. It seems worse to be uncertain about the prevalence of swine flu: within the tested range of prevalence values, the probability of having swine flu given detection dips below 0.5 (to about 0.44 at a prevalence of 0.1), so the share of positive results that are false rises above 0.5. By contrast, across the tested range of false detection rates the probability stays above roughly 0.68. This means that, at low prevalence, people who test positive for swine flu are more likely not to have the disease than to have it. This is very bad for a detection protocol.
# read the swine flu dataset (CSV) from its URL into the data frame d1
d1 <- read.csv("http://faraway.neu.edu/biostats/assn1_dataset1.csv")
H0: The prevalence of swine flu in individuals is not affected by age. HA: The prevalence of swine flu in individuals is affected by age.
# split the subjects by flu type
d1_swineflu <- d1[d1[["flu"]] == "swine", ]
d1_seasonalflu <- d1[d1[["flu"]] == "seasonal", ]
# stack the two histograms vertically
par(mfrow = c(2, 1))
# age distribution of subjects with seasonal flu; both panels share the same
# x-axis range (0 to 90)
hist(
  d1_seasonalflu[["age"]],
  xlim = c(0, 90), xlab = "Ages",
  main = "Individuals with Seasonal Flu"
)
# age distribution of subjects with swine flu
hist(
  d1_swineflu[["age"]],
  xlim = c(0, 90), xlab = "Ages",
  main = "Individuals with Swine Flu"
)
The histograms indicate that swine flu mainly affects adults and older adults, spanning a wide range of ages, while the seasonal flu mainly affects the young and the old.
# split the subjects into adults (ages 18 to 65, inclusive) and
# children/elderly (younger than 18 or older than 65)
d1_between_18_65 <- d1[d1$age >= 18 & d1$age <= 65, ]
d1_outside_18_65 <- d1[d1$age < 18 | d1$age > 65, ]
# count the individuals of each flu type within each age group
d1_sum_between_18_65 <- aggregate(
  age ~ flu,
  FUN = length, data = d1_between_18_65
)
d1_sum_outside_18_65 <- aggregate(
  age ~ flu,
  FUN = length, data = d1_outside_18_65
)
# combine both sets of counts into one table; all = TRUE keeps a flu type
# even when it does not occur in one of the age groups (the default inner
# merge would silently drop that row)
df.merge <- merge(
  d1_sum_between_18_65, d1_sum_outside_18_65,
  by = "flu", all = TRUE
)
# change column names to age groups
colnames(df.merge)[2:3] <- c("adults", "children/elderly")
# a flu type absent from an age group has a count of zero, not NA
df.merge[2:3] <- lapply(df.merge[2:3], function(n) {
  n[is.na(n)] <- 0L
  n
})
# print the new merged dataframe
print(df.merge)
## flu adults children/elderly
## 1 seasonal 7 73
## 2 swine 78 2
# perform a chi-squared test on the two count columns of df.merge
# (adults and children/elderly, one row per flu type)
chisq.test(df.merge[2:3])
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: df.merge[2:3]
## X-squared = 122.98, df = 1, p-value < 2.2e-16
The p-value is less than 2.2e-16, which makes the result statistically significant, so we reject H0.