Week 4 Homework
# Import the homework data from Winter_Grawunder_2012.csv into the data frame
# WG; header = TRUE takes the column names from the first row of the file.
# NOTE(review): this absolute path only resolves on the author's machine.
WG <- read.csv(
  file = "C:/Users/kelse/OneDrive/Documents/Research Design Analysis/Files to Import/Winter_Grawunder_2012.csv",
  header = TRUE
)
Question 1
a.
# Number of observations (rows) in WG.
dim(WG)[1L]
## [1] 84
# Number of variables (columns) in WG.
dim(WG)[2L]
## [1] 5
b.
# Tabulate how many observations fall under each value of the gender column.
table(WG[["gender"]])
##
## F M
## 42 42
c.
# The default histogram was too coarse ("lumpy"), so request roughly 20 breaks
# to show the shape of the frequency distribution in more detail.
# Drop the missing frequencies once and reuse the cleaned vector for the
# histogram and both overlaid curves.
freq <- WG$frequency[!is.na(WG$frequency)]
freq_mean <- mean(freq)
freq_sd <- sd(freq)

# probability = TRUE draws the bars on the density scale so that the density
# curves below share the same y-axis; the axis is therefore labelled "Density".
hist(
  freq,
  main = "Distribution of Frequency",
  xlab = "Frequency",
  ylab = "Density",
  col = "lavender",
  breaks = 20,
  border = "lightblue",
  probability = TRUE
)

# First, overlay the normal curve that has the sample mean and sample sd.
# Inside curve(), `x` is curve()'s own grid of plotting positions, not the
# data, so the mean and sd must be computed from the data beforehand instead
# of being written as mean(x) and sd(x) in the expression.
curve(dnorm(x, mean = freq_mean, sd = freq_sd), col = "orange", lwd = 2, add = TRUE)

# Then overlay the kernel density estimate of the observed frequencies.
lines(density(freq), col = "purple", lwd = 2)

# The distribution is bimodal (it peaks in two places), but it does not look strongly skewed to the left or right, so I am not going to transform it.
Question 2
a.
# Side-by-side boxplots of frequency, one box per attitude group.
# NOTE(review): `names` relabels the boxes by position, so it assumes the
# attitude groups are drawn in the order "inf" then "pol" -- confirm.
boxplot(
  frequency ~ attitude,
  data = WG,
  main = "Frequency as a Function of Attitude",
  xlab = "Attitude",
  ylab = "Frequency",
  col = c("lightgreen", "lightpink"),
  names = c("Informal Speech", "Formal Speech")
)

b.
# Bootstrap test for the difference in mean frequency between informal and
# formal speech. Rows with a missing frequency are removed first.
WG_clean <- WG[complete.cases(WG$frequency), ]
formal_speech <- with(WG_clean, frequency[attitude == "pol"])
informal_speech <- with(WG_clean, frequency[attitude == "inf"])

# Observed statistic: mean(informal) - mean(formal).
observed_diff <- mean(informal_speech) - mean(formal_speech)

# Under the null hypothesis both groups come from one pooled population, so
# each iteration resamples both groups (with replacement) from the pooled data.
set.seed(123)
n_iterations <- 1000
population <- c(formal_speech, informal_speech)
boot_diff <- vapply(
  seq_len(n_iterations),
  function(iter) {
    # Draw the formal sample first, then the informal one, so the random
    # number stream is consumed in a fixed order.
    resampled_formal <- sample(population, length(formal_speech), replace = TRUE)
    resampled_informal <- sample(population, length(informal_speech), replace = TRUE)
    mean(resampled_informal) - mean(resampled_formal)
  },
  numeric(1)
)

# Two-sided p-value: share of resampled differences at least as extreme (in
# absolute value) as the observed difference.
p_value <- mean(abs(boot_diff) >= abs(observed_diff))
observed_diff
## [1] 18.232
p_value
## [1] 0.207
c.
# Two-sided two-sample t-test comparing mean frequency in formal ("pol") and
# informal ("inf") speech, using only rows with an observed frequency.
WG_clean <- WG[!is.na(WG[["frequency"]]), ]
formal_speech <- WG_clean[WG_clean$attitude == "pol", "frequency"]
informal_speech <- WG_clean[WG_clean$attitude == "inf", "frequency"]
t_test_result <- t.test(
  x = formal_speech,
  y = informal_speech,
  alternative = "two.sided"
)
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: formal_speech and informal_speech
## t = -1.2726, df = 80.938, p-value = 0.2068
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -46.73684 10.27285
## sample estimates:
## mean of x mean of y
## 184.3561 202.5881
d.
# The two p-values are nearly identical (0.207 from the bootstrap test and 0.2068 from the Welch t-test). Since p ≈ 0.207 is greater than α = 0.05, there is not enough evidence to reject the null hypothesis that there is no difference in speech frequency between formal and informal speech.
Question 3
a.
# Percentile bootstrap 95% confidence interval for the mean female frequency.
# Keep only female rows with an observed frequency: a missing value would make
# every bootstrap mean that draws it NA, and quantile() stops on NA input.
# %in% (unlike ==) never yields NA, so rows with a missing gender are dropped
# as well.
female_speech <- WG$frequency[WG$gender %in% "F" & !is.na(WG$frequency)]
set.seed(123)
n_iterations <- 1000

# Each iteration resamples the female frequencies with replacement (same size
# as the original sample) and records the mean of that resample.
bootstrap_means <- vapply(
  seq_len(n_iterations),
  function(iter) {
    mean(sample(female_speech, length(female_speech), replace = TRUE))
  },
  numeric(1)
)

# The 2.5th and 97.5th percentiles of the bootstrap means bound the 95% CI.
lower_confidence_limit <- quantile(bootstrap_means, 0.025)
upper_confidence_limit <- quantile(bootstrap_means, 0.975)
lower_confidence_limit
## 2.5%
## 236.4693
upper_confidence_limit
## 97.5%
## 256.5326
b.
# t-distribution 95% confidence interval for the mean female frequency.
# Keep only female rows with an observed frequency, so that mean() and sd() do
# not return NA and length() counts only the values actually used.
female_speech <- WG$frequency[WG$gender %in% "F" & !is.na(WG$frequency)]
mean_female <- mean(female_speech)
sd_female <- sd(female_speech)
n_female <- length(female_speech)

# Standard error of the mean, and the half-width of the interval from the
# 97.5th percentile of the t distribution with n - 1 degrees of freedom.
standard_error <- sd_female / sqrt(n_female)
confidence_interval_margin <- qt(0.975, df = n_female - 1) * standard_error
lower_confidence_limit <- mean_female - confidence_interval_margin
upper_confidence_limit <- mean_female + confidence_interval_margin
lower_confidence_limit
## [1] 236.198
upper_confidence_limit
## [1] 257.7735
c.
# The two methods give very similar confidence intervals: about 236.5 to 256.5 from the bootstrap and about 236.2 to 257.8 from the t-distribution. I am not sure whether the numbers need to agree more precisely, but they are extremely close to one another.
d.
# Both 95% confidence intervals (roughly 236 to 258) exclude 260, so your idea that the mean female frequency is 260 probably isn't true. I think this means that the true mean female frequency is a little lower and falls inside that confidence interval range.