# Function for t-test critical values, confidence interval, t-statistic & p-value,
# given either descriptive statistics or raw sample data.
# Inputs:
#   cl   = confidence level in decimal form
#   data = either sample data in vector form, or descriptive statistics in
#          vector form as c(bar_x, mu, s, n)
#   tail = "2" for a two-tailed test, "L"/"l" for left-tailed, "R"/"r" for right-tailed
#   mu   = hypothesized mean, used with raw sample data (default 0)
# Output: a tibble with the critical t value(s), the confidence interval,
#   the t-statistic, and the p-value
tCrit <- function(cl, data, tail, mu = 0) {
f.pV.v <- function(x, data) {
pv <- pt(x, df = data[4] - 1)
return(pv)
}
f.pV.d <- function(x, data) {
pv <- pt(x, df = length(data) - 1)
return(pv)
}
f.tS.v <- function(data) {
ts.v <- (data[1] - data[2])/(data[3]/sqrt(data[4]))
return(ts.v)
}
f.TS.d <- function(data) {
ts.d <- (mean(data) - mu)/(sd(data)/sqrt(length(data)))
return(ts.d)
}
fcI.v <- function(cV, data) {
ci.v <- c(data[2] - (qt(cV, data[4] - 1) * data[3]/sqrt(data[4])), data[2] +
(qt(cV, data[4] - 1) * data[3]/sqrt(data[4])))
return(ci.v)
}
f.cI.d <- function(cV, data) {
ci.d <- c(mean(data) - qt(cV, length(data) - 1) * sd(data)/sqrt(length(data)),
mean(data) + qt(cV, length(data) - 1) * sd(data)/sqrt(length(data)))
return(ci.d)
}
output <- function(tc, ci, ts, pv) {
cValues <- tibble::tribble(~Variable, ~Values, "Low tCrit", tc[1], "High tCrit",
tc[2], "Conf_Int.Low", ci[1], "Conf_Int.Hi", ci[2], "T Statistic", ts,
"pValue", pv)
return(cValues)
}
if (length(data) == 4) {
if (tail == "2" | tail == 2) {
cV <- (1 - cl)/2 + cl
ci <- fcI.v(cV, data)
ts <- f.tS.v(data)
tc <- c(-1 * qt(cV, data[4] - 1), qt(cV, data[4] - 1))
pv <- 2 * f.pV.v(-1 * abs(ts), data)
cValues <- output(tc, ci, ts, pv)
} else {
if (tail == "L" | tail == "l") {
cV <- (1 - cl)
ci <- fcI.v(cV, data)
ts <- f.tS.v(data)
tc <- c(qt(cV, data[4] - 1), NA)
pv <- f.pV.v(ts, data)
cValues <- output(tc, ci, ts, pv)
} else {
if (tail == "R" | tail == "r") {
cV <- cl
ci <- fcI.v(cV, data)
ts <- f.tS.v(data)
tc <- c(NA, qt(cV, data[4] - 1))
pv <- 1 - f.pV.v(ts, data)
cValues <- output(tc, ci, ts, pv)
}
}
}
} else if (length(data) != 4) {
if (tail == "2" | tail == 2) {
cV <- (1 - cl)/2 + cl
ci <- f.cI.d(cV, data)
ts <- f.TS.d(data)
pv <- 2 * f.pV.d(-1 * abs(ts), data)
tc <- c(-1 * qt(cV, length(data) - 1), qt(cV, length(data) - 1))
cValues <- output(tc, ci, ts, pv)
} else {
if (tail == "L" | tail == "l") {
cV <- (1 - cl)
tc <- c(qt(cV, length(data) - 1), NA)
ci <- f.cI.d(cV, data)
ts <- f.TS.d(data)
pv <- f.pV.d(ts, data)
cValues <- output(tc, ci, ts, pv)
} else {
if (tail == "R" | tail == "r") {
cV <- cl
tc <- c(NA, qt(cV, length(data) - 1))
ci <- f.cI.d(cV, data)
ts <- f.TS.d(data)
pv <- 1 - f.pV.d(ts, data)
cValues <- output(tc, ci, ts, pv)
}
}
}
} else {
return(NA)
}
return(cValues)
}
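As a quick sanity check on the raw-data branch, tCrit() can be compared against base R's t.test() on a simulated sample (a minimal sketch; the seed and vector x are illustrative, not part of the assignment data):
set.seed(1)  # illustrative seed
x <- rnorm(30, mean = 5, sd = 2)
tCrit(0.95, x, "2")  # two-tailed: critical values, CI, t-statistic, p-value
t.test(x, mu = 0)    # base R equivalent; the CI, t, and p-value should agree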
# Two-tailed two-sample t-test
# Inputs:
#   cl     = confidence level
#   m1, m2 = means of samples 1 & 2
#   s1, s2 = standard deviations of samples 1 & 2
#   n1, n2 = number of observations in samples 1 & 2
#   m0     = allows adjustment of the difference you want to test for (default 0)
#   var    = are the variances close enough to pool the samples? F = no, T = yes (default F)
# Output: a tibble with statistic names in column 1 ('Variables') and values in
#   column 2 ('Values'); see the tibble labels for each output
t.2 <- function(cl, m1, m2, s1, s2, n1, n2, m0 = 0, var = F) {
pV <- function(x, df) {
pv <- 2 * pt(-abs(x), df)
return(pv)
}
cV <- (1 - cl)/2 + cl
if (var == FALSE) {
se <- sqrt((s1^2/n1) + (s2^2/n2))
df <- ((s1^2/n1 + s2^2/n2)^2)/((s1^2/n1)^2/(n1 - 1) + (s2^2/n2)^2/(n2 - 1))
} else {
df <- n1 + n2 - 2
sd <- sqrt(((n1 - 1) * s1^2 + (n2 - 1) * s2^2)/df)
se <- sqrt((1/n1 + 1/n2)) * sd
}
t <- (m1 - m2 - m0)/se
lc <- -1 * qt(cV, df)
hc <- qt(cV, df)
pv <- pV(t, df)
t2 <- tibble::tribble(~Variables, ~Values, "meanDiff", m1 - m2, "SE", se, "lCrit",
lc, "hCrit", hc, "t", t, "pValue", pv)
return(t2)
}For the R code herein to work, it will require the following steps:
1. unzip the uploaded PPUA_5301__Homework_5.zip with R or unzipping program of your choice.
2. Run the ‘Call Data’ chunk and the GSS chunk below it.
# setwd('C:\\Users\\Stephen\\Documents\\Northeastern\\PPUA 5301 -
# Introduction to Computational Statistics\\Homework\\Homework
# 5\\PPUA_5301__Homework_5')
this.dir <- dirname("Holsenbeck_S_5.Rmd")
setwd(this.dir)
getwd()  # Check to see that it's the directory the file was opened from
## [1] "C:/Users/Stephen/Documents/Northeastern/Git/ppua5301/Homework 5"
knitr::read_chunk("GSS.r")You hypothesize that the average person is smarter than Sarah Palin. You know her IQ is 100. You give an IQ test to 100 randomly selected people, and get a mean of 104 and standard deviation of 22. Please show your work for each question.
What is your null hypothesis? \(H_0: \mu=100\)
What is your research hypothesis? \(H_a: \mu>100\)
What is your test statistic? With 100 observations, the sampling distribution of the mean is approximately normal. Note that dividing the difference by the sample standard deviation alone gives a z-score relative to individual-level variation: \(z = \frac{\bar{x}-\mu}{s}\)
\(z = \frac{104-100}{22}\)
\(z = \frac{4}{22}\)
\(z = 0.1818\)
The test statistic for the sample mean, however, divides by the standard error \(\frac{s}{\sqrt{n}}\): \(t_{stat}=\frac{\bar{x}-\mu}{\frac{s}{\sqrt{n}}}\)
\(t_{stat}=\frac{104-100}{\frac{22}{\sqrt{100}}}\)
\(t_{stat}=1.818\)
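A quick base-R check of the t-statistic and the corresponding one-tailed p-value (a sketch using only the values given in the problem):
t_stat <- (104 - 100)/(22/sqrt(100))       # 1.818
pt(t_stat, df = 99, lower.tail = FALSE)    # one-tailed p-value, ~0.036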
zs <- function(x, mu, sd) {
z <- (x - mu)/sd
return(z)
}
(z <- zs(104, 100, 22))
## [1] 0.1818182
tCrit(0.95, c(104, 100, 22, 100), "r")## # A tibble: 5 x 2
## Variable Value
## <chr> <dbl>
## 1 High tCrit 1.66039116
## 2 Conf_Int.Low 96.34713946
## 3 Conf_Int.Hi 103.65286054
## 4 T Statistic 1.81818182
## 5 pValue 0.03603016
# For detail on the tCrit or t.2 functions, see the Custom Fns chunk in the rmd
tCrit(0.95, c(104, 100, 22, 100), "r")## # A tibble: 5 x 2
## Variable Value
## <chr> <dbl>
## 1 High tCrit 1.66039116
## 2 Conf_Int.Low 96.34713946
## 3 Conf_Int.Hi 103.65286054
## 4 T Statistic 1.81818182
## 5 pValue 0.03603016
A) If \(t_{stat}>t_{crit}\), then the null hypothesis can be rejected. The calculation below indicates the null can be rejected in a one-tailed test at the 95% confidence level: \(t_{stat}=1.818>t_{crit}=1.66\)
tCrit(0.95, c(104, 100, 22, 100), "r")## # A tibble: 5 x 2
## Variable Value
## <chr> <dbl>
## 1 High tCrit 1.66039116
## 2 Conf_Int.Low 96.34713946
## 3 Conf_Int.Hi 103.65286054
## 4 T Statistic 1.81818182
## 5 pValue 0.03603016
tCrit(0.95, c(104, 100, 22, 100), "2")## # A tibble: 6 x 2
## Variable Values
## <chr> <dbl>
## 1 Low tCrit -1.98421695
## 2 High tCrit 1.98421695
## 3 Conf_Int.Low 95.63472271
## 4 Conf_Int.Hi 104.36527729
## 5 T Statistic 1.81818182
## 6 pValue 0.07206032
What is your 95% confidence interval? \(\text{Confidence Interval}: CI=\bar{x}\pm t^{*}\frac{s}{\sqrt{n}}\)
\(CI=104\pm 1.984\times\frac{22}{\sqrt{100}}\)
\(CI=104\pm 4.3648\)
(Note that the tCrit output below centers the interval on the hypothesized mean of 100 rather than on \(\bar{x}=104\), which is why its bounds differ.)
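The margin of error can be verified directly with qt() (a sketch using the problem's values):
qt(0.975, df = 99) * 22/sqrt(100)  # ~4.3648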
tCrit(0.95, c(104, 100, 22, 100), "2")## # A tibble: 6 x 2
## Variable Values
## <chr> <dbl>
## 1 Low tCrit -1.98421695
## 2 High tCrit 1.98421695
## 3 Conf_Int.Low 95.63472271
## 4 Conf_Int.Hi 104.36527729
## 5 T Statistic 1.81818182
## 6 pValue 0.07206032
A) How the \(p_{value}\) is computed depends on whether the test is one- or two-tailed. For a two-tailed test it is: \(p_{value}=2\int_{-\infty}^{-|t_{stat}|}f(t)\,dt\)
For a left-tailed test it is: \(p_{value}=\int_{-\infty}^{t_{stat}}f(t)\,dt\)
For a right-tailed test it is: \(p_{value}=1-\int_{-\infty}^{t_{stat}}f(t)\,dt\)
Based on these formulas, the \(p_{value}\) is 0.036 for the right-tailed test, rejecting the null hypothesis, and 0.072 for the two-tailed test, failing to reject the null hypothesis.
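These p-values can be reproduced directly with pt(), using \(t_{stat}=1.818\) and \(df=99\):
2 * pt(-abs(1.818182), df = 99)            # two-tailed, ~0.072
pt(1.818182, df = 99, lower.tail = FALSE)  # right-tailed, ~0.036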
tCrit(0.95, c(104, 100, 22, 100), "2")## # A tibble: 6 x 2
## Variable Values
## <chr> <dbl>
## 1 Low tCrit -1.98421695
## 2 High tCrit 1.98421695
## 3 Conf_Int.Low 95.63472271
## 4 Conf_Int.Hi 104.36527729
## 5 T Statistic 1.81818182
## 6 pValue 0.07206032
You hypothesize that men and women have different skill levels in playing Tetris. To test this, you have 50 men and 50 women play the game in a controlled setting. The mean score of the men is 1124 with a standard deviation of 200 and the mean score for the women is 1245, also with a standard deviation of 200.
t.2(0.95, 1124, 1245, 200, 200, 50, 50, var = T)
## # A tibble: 6 x 2
## Variables Values
## <chr> <dbl>
## 1 meanDiff -1.210000e+02
## 2 SE 4.000000e+01
## 3 lCrit -1.984467e+00
## 4 hCrit 1.984467e+00
## 5 t -3.025000e+00
## 6 pValue 3.174979e-03
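The pooled standard error and t-statistic can be checked by hand (a sketch using the values above):
sp <- sqrt(((50 - 1) * 200^2 + (50 - 1) * 200^2)/(50 + 50 - 2))  # pooled SD = 200
se <- sp * sqrt(1/50 + 1/50)                                     # SE = 40
(1124 - 1245)/se                                                 # t = -3.025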
You think drinking the night before an exam might help performance on the exam the next morning. To test this, you select 100 of your closest friends and randomly get 50 of them to drink the night before the exam, which you denote the treatment group. The next day, the treatment group gets a mean of 78 with a standard deviation of 10 and the control group gets a mean of 75 with a standard deviation of 5.
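For reference, the degrees of freedom computed below follow the Welch-Satterthwaite approximation used in t.2() above: \(df = \frac{\left(\frac{s_1^2}{n_1}+\frac{s_2^2}{n_2}\right)^2}{\frac{(s_1^2/n_1)^2}{n_1-1}+\frac{(s_2^2/n_2)^2}{n_2-1}}\)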
# t-statistic: (m1 - m2)/sqrt(s1^2/n1 + s2^2/n2)
(ts <- 3/sqrt(100/50 + 25/50))
## [1] 1.897367
# Welch-Satterthwaite df; c is the share of the variance of the difference due to the treatment group
(c <- 2/2.5)
## [1] 0.8
(df <- 49 * 49/((49) * 0.8^2 + (1 - 0.8)^2 * (49)))
## [1] 72.05882
# Critical Value
qt(0.975, df)
## [1] 1.993436
t.2(0.95, 78, 75, 10, 5, 50, 50, var = F)
## # A tibble: 6 x 2
## Variables Values
## <chr> <dbl>
## 1 meanDiff 3.00000000
## 2 SE 1.58113883
## 3 lCrit -1.99343576
## 4 hCrit 1.99343576
## 5 t 1.89736660
## 6 pValue 0.06178651
Using data of your choosing (or using simulated data), use R to conduct the following tests, and explain the results you get:
### a. A standard one-sample hypothesis test.
data <- rt(100, 99)
t.test(data)
##
## One Sample t-test
##
## data: data
## t = 0.018249, df = 99, p-value = 0.9855
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.1703142 0.1734761
## sample estimates:
## mean of x
## 0.00158092
### b. A difference-in-means test with independent samples.
View(GSS)
# Variable HEALTH Question: Would you say your own health, in general, is
# excellent, good, fair, or poor Answers in tibble below
tibble::tribble(~Value, ~Answer, 1L, "Excellent", 2L, "Good", 3L, "Fair", 4L, "Poor",
8L, "Don't know", 9L, "No answer", 0L, "Not applicable")## # A tibble: 7 x 2
## Value Answer
## <int> <chr>
## 1 1 Excellent
## 2 2 Good
## 3 3 Fair
## 4 4 Poor
## 5 8 Don't know
## 6 9 No answer
## 7 0 Not applicable
(Total.Resp <- GSS %>% group_by(YEAR) %>% summarise(n = n()))
## # A tibble: 3 x 2
## YEAR n
## <int> <int>
## 1 2012 1974
## 2 2014 2538
## 3 2016 2867
(health_by_yr <- GSS %>% group_by(YEAR) %>% filter(HEALTH > 0 & HEALTH < 5) %>% summarise(mHealth = mean(HEALTH),
sHealth = sd(HEALTH), nHealth = n()))
## # A tibble: 3 x 4
## YEAR mHealth sHealth nHealth
## <int> <dbl> <dbl> <int>
## 1 2012 2.069678 0.8535072 1306
## 2 2014 2.089474 0.8638472 1710
## 3 2016 2.131565 0.8266308 1885
health_by_yr <- cbind(health_by_yr, total.n = Total.Resp$n)
health_by_yr %>% mutate(p.Resp = nHealth/total.n)
## YEAR mHealth sHealth nHealth total.n p.Resp
## 1 2012 2.069678 0.8535072 1306 1974 0.6616008
## 2 2014 2.089474 0.8638472 1710 2538 0.6737589
## 3 2016 2.131565 0.8266308 1885 2867 0.6574817
health_by_yr12 <- GSS %>% group_by(YEAR) %>% filter(HEALTH > 0 & HEALTH < 5, YEAR ==
2012)
health_by_yr16 <- GSS %>% group_by(YEAR) %>% filter(HEALTH > 0 & HEALTH < 5, YEAR ==
2016)
t.test(health_by_yr12$HEALTH, health_by_yr16$HEALTH)
##
## Welch Two Sample t-test
##
## data: health_by_yr12$HEALTH and health_by_yr16$HEALTH
## t = -2.04, df = 2748.3, p-value = 0.04144
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.121370917 -0.002402242
## sample estimates:
## mean of x mean of y
## 2.069678 2.131565
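As a cross-check, the same comparison can be run through the t.2() function defined above using the summary statistics reported in health_by_yr; it should reproduce the Welch result (t of about -2.04) up to rounding of the inputs:
t.2(0.95, 2.069678, 2.131565, 0.8535072, 0.8266308, 1306, 1885)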
SCISTUDY: Now, for a slightly different type of question. When you read news stories, you see certain sets of words and terms. We are interested in how many people recognize certain kinds of terms. First, some articles refer to the results of a scientific study. When you read or hear the term scientific study, do you have a clear understanding of what it means, a general sense of what it means, or little understanding of what it means?
We will compare their answers to the following questions:
ODDS1: Now, think about this situation. A doctor tells a couple that their genetic makeup means that they have got one in four chances of having a child with an inherited illness. A. Does this mean that if their first child has the illness, the next three will not have the illness?
EARTHSUN: Now, I would like to ask you a few short questions like those you might see on a television game show. For each statement that I read, please tell me if it is true or false. If you don't know or aren't sure, just tell me so, and we will skip to the next question. Remember true, false, or don't know. J. Now, does the Earth go around the Sun, or does the Sun go around the Earth?
The t-test is framed as follows:
# Variable: SCISTUDY
SCISTUDY <- tibble::tribble(~Code, ~Label, 1L, "Clear.understanding", 2L, "General sense",
3L, "Little understanding", 8L, "Dont know", 9L, "No answer", 0L, "Not applicable")
# Variable: ODDS1
ODDS1 <- tibble::tribble(~Code, ~Label, 1L, "Yes", 2L, "No", 8L, "Dont know", 9L,
"No answer", 0L, "Not applicable")
# Variable: EARTHSUN
Earthsun <- tibble::tribble(~Code, ~Label, 1L, "Earth around sun", 2L, "Sun around earth",
8L, "Dont know", 9L, "No answer", 0L, "Not applicable")
(sci.s <- GSS %>% filter(SCISTUDY > 0) %>% group_by(ID_) %>% select(ID_, SCISTUDY,
ODDS1, EARTHSUN))
## # A tibble: 3,631 x 4
## # Groups: ID_ [2,289]
## ID_ SCISTUDY ODDS1 EARTHSUN
## <int> <int> <int> <int>
## 1 2 2 2 1
## 2 3 2 2 1
## 3 4 2 2 1
## 4 5 1 2 1
## 5 8 8 2 1
## 6 10 1 2 1
## 7 13 1 2 1
## 8 15 2 1 1
## 9 16 2 1 1
## 10 17 2 8 1
## # ... with 3,621 more rows
# flip() recodes EARTHSUN so that 1 is the wrong answer, as in ODDS1, so the two
# items can be compared via the paired t-test; ifelse() keeps the recode
# vectorized inside mutate()
flip <- function(x) {
ifelse(x == 1, 2, 1)
}
sci.hi <- sci.s %>% filter((SCISTUDY == 1 | SCISTUDY == 2) & (EARTHSUN == 1 | EARTHSUN ==
2) & (ODDS1 == 1 | ODDS1 == 2)) %>% mutate(ES = flip(EARTHSUN))
t.test(sci.hi$ODDS1, sci.hi$ES, paired = T)
##
## Paired t-test
##
## data: sci.hi$ODDS1 and sci.hi$ES
## t = 10.307, df = 2581, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.07965883 0.11708788
## sample estimates:
## mean of the differences
## 0.09837335
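The paired statistic can also be recovered by hand from the vector of differences (a sketch, assuming sci.hi has been built as above):
d <- sci.hi$ODDS1 - sci.hi$ES
mean(d)/(sd(d)/sqrt(length(d)))  # should match the paired t of ~10.3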
t.test(sci.hi$ODDS1, sci.hi$ES)  # See the actual means
##
## Welch Two Sample t-test
##
## data: sci.hi$ODDS1 and sci.hi$ES
## t = 10.129, df = 4769.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.07933338 0.11741332
## sample estimates:
## mean of x mean of y
## 1.903950 1.805577
# Hand check of the one-sample t-test on data from part a
mean(data)
## [1] 0.00158092
sd(data)
## [1] 0.8663122
0.0016/0.0866
## [1] 0.01847575
qt(0.975, 99)
## [1] 1.984217
tCrit(0.95, data, "2") #Check the answers## # A tibble: 6 x 2
## Variable Values
## <chr> <dbl>
## 1 Low tCrit -1.98421695
## 2 High tCrit 1.98421695
## 3 Conf_Int.Low -0.17031422
## 4 Conf_Int.Hi 0.17347606
## 5 T Statistic 0.01824885
## 6 pValue 0.98547706