The experiment involves collecting data with the question in mind and will include analyzing the data to seek the answer
# Load the ToothGrowth dataset
data("ToothGrowth")
# Perform a two-sided t-test
t.test(x = ToothGrowth$len, alternative = "two.sided", mu = 18)
##
## One Sample t-test
##
## data: ToothGrowth$len
## t = 0.82361, df = 59, p-value = 0.4135
## alternative hypothesis: true mean is not equal to 18
## 95 percent confidence interval:
## 16.83731 20.78936
## sample estimates:
## mean of x
## 18.81333
is there a difference in tooth length based on supplement type?
# Perform a t-test
ToothGrowth_ttest <- t.test(len ~ supp, data = ToothGrowth)
# Load broom
library(broom)
# Tidy ToothGrowth_ttest
tidy(ToothGrowth_ttest)
## # A tibble: 1 x 10
## estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.70 20.7 17.0 1.92 0.0606 55.3 -0.171 7.57
## # ... with 2 more variables: method <chr>, alternative <chr>
Power: probability that the test correctly rejects the null hypothesis when the alternative hyothesis is true
Effect size: Standardized measure of the difference you’re trying to detect
Sample size: How many experimental units you need to survey to detect the desired difference at the desired power
We can also conduct a one-sided t-test, explicitly checking to see if the mean is less than or greater than 18. Whether to use a one- or two-sided test usually follows from your research question. Does an intervention cause longer tooth growth? One-sided, greater than. Does a drug cause the test group to lose more weight? One-sided, less than. Is there a difference in mean test scores between two groups of students? Two-sided test.
# Less than
t.test(x = ToothGrowth$len,
alternative = "less",
mu = 18)
##
## One Sample t-test
##
## data: ToothGrowth$len
## t = 0.82361, df = 59, p-value = 0.7933
## alternative hypothesis: true mean is less than 18
## 95 percent confidence interval:
## -Inf 20.46358
## sample estimates:
## mean of x
## 18.81333
# Greater than
t.test(x = ToothGrowth$len,
alternative = "greater",
mu = 18)
##
## One Sample t-test
##
## data: ToothGrowth$len
## t = 0.82361, df = 59, p-value = 0.2067
## alternative hypothesis: true mean is greater than 18
## 95 percent confidence interval:
## 17.16309 Inf
## sample estimates:
## mean of x
## 18.81333
# Load the pwr package
library(pwr)
## Warning: package 'pwr' was built under R version 4.0.3
# Calculate power
pwr.t.test(n = 100,
d = 0.35,
sig.level = 0.1,
type = "two.sample",
alternative = "two.sided",
power = NULL)
##
## Two-sample t test power calculation
##
## n = 100
## d = 0.35
## sig.level = 0.1
## power = 0.7943532
## alternative = two.sided
##
## NOTE: n is number in *each* group
# Calculate sample size
pwr.t.test(n = NULL,
d = 0.25,
sig.level = 0.05,
type = "one.sample",
alternative = "greater",
power = 0.8)
##
## One-sample t test power calculation
##
## n = 100.2877
## d = 0.25
## sig.level = 0.05
## power = 0.8
## alternative = greater
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
lendingclub <- read.csv("lendclub.csv")
# Examine the variables with glimpse()
glimpse(lendingclub)
## Rows: 1,500
## Columns: 12
## $ member_id <int> 55096114, 1555332, 1009151, 69524202, 72128084,...
## $ loan_amnt <int> 11000, 10000, 13000, 5000, 18000, 14000, 8000, ...
## $ funded_amnt <int> 11000, 10000, 13000, 5000, 18000, 14000, 8000, ...
## $ term <chr> "36 months", "36 months", "60 months", "36 mont...
## $ int_rate <dbl> 12.69, 6.62, 10.99, 12.05, 5.32, 16.99, 13.11, ...
## $ emp_length <chr> "10+ years", "10+ years", "3 years", "10+ years...
## $ home_ownership <chr> "RENT", "MORTGAGE", "MORTGAGE", "MORTGAGE", "MO...
## $ annual_inc <dbl> 51000, 40000, 78204, 51000, 96000, 47000, 40000...
## $ verification_status <chr> "Not Verified", "Verified", "Not Verified", "No...
## $ loan_status <chr> "Current", "Fully Paid", "Fully Paid", "Current...
## $ purpose <chr> "debt_consolidation", "debt_consolidation", "ho...
## $ grade <chr> "C", "A", "B", "C", "A", "D", "C", "A", "D", "B...
# Find median loan_amnt, mean int_rate, and mean annual_inc with summarize()
lendingclub %>% summarize(median(loan_amnt), mean(int_rate), mean(annual_inc))
## median(loan_amnt) mean(int_rate) mean(annual_inc)
## 1 13000 13.31472 75736.03
# Use ggplot2 to build a bar chart of purpose
ggplot(data=lendingclub, aes(x = purpose)) +
geom_bar() +
coord_flip()
# Use recode() to create the new purpose_recode variable
lendingclub$purpose_recode <- lendingclub$purpose %>% recode(
"credit_card" = "debt_related",
"debt_consolidation" = "debt_related",
"medical" = "debt_related",
"car" = "big_purchase",
"major_purchase" = "big_purchase",
"vacation" = "big_purchase",
"moving" = "life_change",
"small_business" = "life_change",
"wedding" = "life_change",
"house" = "home_related",
"home_improvement" = "home_related")
# Build a linear regression model, purpose_recode_model
purpose_recode_model <- lm(funded_amnt ~ purpose_recode, data = lendingclub)
# Examine results of purpose_recode_model
summary(purpose_recode_model)
##
## Call:
## lm(formula = funded_amnt ~ purpose_recode, data = lendingclub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14472 -6251 -1322 4678 25761
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9888.1 1248.9 7.917 4.69e-15 ***
## purpose_recodedebt_related 5433.5 1270.5 4.277 2.02e-05 ***
## purpose_recodehome_related 4845.0 1501.0 3.228 0.00127 **
## purpose_recodelife_change 4095.3 2197.2 1.864 0.06254 .
## purpose_recodeother -649.3 1598.3 -0.406 0.68461
## purpose_recoderenewable_energy -1796.4 4943.3 -0.363 0.71636
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8284 on 1494 degrees of freedom
## Multiple R-squared: 0.03473, Adjusted R-squared: 0.0315
## F-statistic: 10.75 on 5 and 1494 DF, p-value: 3.598e-10
# Get anova results and save as purpose_recode_anova
purpose_recode_anova <- anova(purpose_recode_model)
# Print purpose_recode_anova
purpose_recode_anova
## Analysis of Variance Table
##
## Response: funded_amnt
## Df Sum Sq Mean Sq F value Pr(>F)
## purpose_recode 5 3.6888e+09 737756668 10.75 3.598e-10 ***
## Residuals 1494 1.0253e+11 68629950
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Examine class of purpose_recode_anova
class(purpose_recode_anova)
## [1] "anova" "data.frame"
# Use aov() to build purpose_aov
purpose_aov <- aov(funded_amnt ~ purpose_recode, data = lendingclub)
# Conduct Tukey's HSD test to create tukey_output
tukey_output <- TukeyHSD(purpose_aov, "purpose_recode", conf.level = 0.95)
# Tidy tukey_output to make sense of the results
tidy(tukey_output)
## # A tibble: 15 x 7
## term contrast null.value estimate conf.low conf.high adj.p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 purpose_~ debt_related-bi~ 0 5434. 1808. 9059. 2.91e-4
## 2 purpose_~ home_related-bi~ 0 4845. 562. 9128. 1.61e-2
## 3 purpose_~ life_change-big~ 0 4095. -2174. 10365. 4.25e-1
## 4 purpose_~ other-big_purch~ 0 -649. -5210. 3911. 9.99e-1
## 5 purpose_~ renewable_energ~ 0 -1796. -15902. 12309. 9.99e-1
## 6 purpose_~ home_related-de~ 0 -589. -3056. 1879. 9.84e-1
## 7 purpose_~ life_change-deb~ 0 -1338. -6539. 3863. 9.78e-1
## 8 purpose_~ other-debt_rela~ 0 -6083. -9005. -3160. 5.32e-8
## 9 purpose_~ renewable_energ~ 0 -7230. -20894. 6434. 6.58e-1
## 10 purpose_~ life_change-hom~ 0 -750. -6429. 4929. 9.99e-1
## 11 purpose_~ other-home_rela~ 0 -5494. -9201. -1787. 3.58e-4
## 12 purpose_~ renewable_energ~ 0 -6641. -20494. 7212. 7.46e-1
## 13 purpose_~ other-life_chan~ 0 -4745. -10636. 1147. 1.95e-1
## 14 purpose_~ renewable_energ~ 0 -5892. -20482. 8698. 8.59e-1
## 15 purpose_~ renewable_energ~ 0 -1147. -15088. 12794. 1.00e+0
# Use aov() to build purpose_emp_aov
purpose_emp_aov <- aov(funded_amnt ~ purpose_recode + emp_length, data = lendingclub)
# Print purpose_emp_aov to the console
purpose_emp_aov
## Call:
## aov(formula = funded_amnt ~ purpose_recode + emp_length, data = lendingclub)
##
## Terms:
## purpose_recode emp_length Residuals
## Sum of Squares 3688783338 2044273211 100488872355
## Deg. of Freedom 5 11 1483
##
## Residual standard error: 8231.679
## Estimated effects may be unbalanced
# Call summary() to see the p-values
summary(purpose_emp_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## purpose_recode 5 3.689e+09 737756668 10.888 2.63e-10 ***
## emp_length 11 2.044e+09 185843019 2.743 0.00161 **
## Residuals 1483 1.005e+11 67760534
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Examine the summary of int_rate
summary(lendingclub$int_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.32 9.99 12.99 13.31 16.29 26.77
# Examine int_rate by grade
lendingclub %>%
group_by(grade) %>%
summarize(mean = mean(int_rate), var = var(int_rate), median = median(int_rate))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 7 x 4
## grade mean var median
## <chr> <dbl> <dbl> <dbl>
## 1 A 7.27 0.961 7.26
## 2 B 10.9 2.08 11.0
## 3 C 14.0 1.42 14.0
## 4 D 17.4 1.62 17.6
## 5 E 20.1 2.71 20.0
## 6 F 23.6 2.87 23.5
## 7 G 26.1 0.198 25.9
# Make a boxplot of int_rate by grade
ggplot(lendingclub, aes(x = grade, y = int_rate)) +
geom_boxplot()
# Use aov() to create grade_aov plus call summary() to print results
grade_aov <- aov(int_rate ~ grade, data = lendingclub)
summary(grade_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## grade 6 27013 4502 2637 <2e-16 ***
## Residuals 1493 2549 2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# For a 2x2 grid of plots:
par(mfrow = c(2, 2))
# Plot grade_aov
plot(grade_aov)
# Bartlett's test for homogeneity of variance
bartlett.test(int_rate ~ grade, data = lendingclub)
##
## Bartlett test of homogeneity of variances
##
## data: int_rate by grade
## Bartlett's K-squared = 78.549, df = 6, p-value = 7.121e-15
# Conduct the Kruskal-Wallis rank sum test
kruskal.test(int_rate ~ grade,
data = lendingclub)
##
## Kruskal-Wallis rank sum test
##
## data: int_rate by grade
## Kruskal-Wallis chi-squared = 1365.5, df = 6, p-value < 2.2e-16
# Load the pwr package
library(pwr)
# Use the correct function from pwr to find the sample size
pwr.t.test(n = NULL,
d = 0.2,
sig.level = 0.05,
power = 0.8,
alternative = "two.sided")
##
## Two-sample t test power calculation
##
## n = 393.4057
## d = 0.2
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
Probability sampling: probability is used to select the sample
Non probability sampling: probability is not used to select the sample
Simple random sampling: Every single unit in a population has an equal probaiblity of being sampled
Stratified sampling: splitting the population by some strata variables, such as race, gender or type
Divide the population into groups called clusters
systematic sampling involves choosing a sample in a systematic way, such every 5th or 10th
nhanes_final <- read.csv("nhanes_final.csv")
# Use sample_n() to create nhanes_srs
nhanes_srs <- nhanes_final %>% sample_n(2500)
# Create nhanes_stratified with group_by() and sample_n()
nhanes_stratified <- nhanes_final %>% group_by(riagendr) %>% sample_n(2000)
nhanes_stratified %>%
count(riagendr)
## # A tibble: 2 x 2
## # Groups: riagendr [2]
## riagendr n
## <int> <int>
## 1 1 2000
## 2 2 2000
# Load sampling package and create nhanes_cluster with cluster()
library(sampling)
## Warning: package 'sampling' was built under R version 4.0.3
nhanes_cluster <- cluster(nhanes_final, "indhhin2", 6, method = "srswor")
TukeyHSD
to determine which combinations are significantly different