Week 4 Homework

# Load the Winter & Grawunder (2012) data set; the first row of the CSV holds
# the column names.
# NOTE(review): this is an absolute path on one specific machine -- the script
# will only run elsewhere after the path is adjusted.
WG <- read.csv(
  file = "C:/Users/kelse/OneDrive/Documents/Research Design Analysis/Files to Import/Winter_Grawunder_2012.csv",
  header = TRUE
)

Question 1

a.

# Number of rows in the data set.
nrow(WG)

## [1] 84

# Number of columns in the data set.
ncol(WG)

## [1] 5

b.

# Count how many rows fall into each gender category.
table(WG$gender)

## 
##  F  M 
## 42 42

c.

# Histogram of the observed frequencies with a fitted normal curve and a
# kernel density estimate drawn on top.

# Missing frequencies are dropped once, up front, and the same vector is used
# for the histogram and for both overlays.
freq_obs <- WG$frequency[!is.na(WG$frequency)]

# The default bins were too coarse ("lumpy"), so ask for roughly 20 breaks to
# show the shape in more detail. The bars are drawn on the density scale
# (freq = FALSE) so that the two curves below share the same y-axis; the
# y-axis is labelled accordingly.
hist(
  freq_obs,
  main = "Distribution of Frequency",
  xlab = "Frequency",
  ylab = "Density",
  col = "lavender",
  border = "lightblue",
  breaks = 20,
  freq = FALSE
)

# First, overlay the normal curve that has the same mean and SD as the data.
# The mean and SD are computed BEFORE calling curve(): inside curve() the name
# `x` refers to the plotting grid, so writing mean(x) / sd(x) in the
# expression would describe the grid instead of the data.
freq_mean <- mean(freq_obs)
freq_sd <- sd(freq_obs)
curve(
  dnorm(x, mean = freq_mean, sd = freq_sd),
  col = "orange",
  lwd = 2,
  add = TRUE
)

# Then overlay the kernel density estimate of the observed frequencies.
lines(density(freq_obs), col = "purple", lwd = 2)

# The distribution is bimodal (it peaks in two places), but neither tail is noticeably longer than the other, so it does not look skewed and I am not going to transform it.

Question 2

a.

# Box plot of frequency split by attitude.
# The labels and fill colours are listed in the order the attitude groups are
# drawn -- assumes the groups sort as "inf" then "pol"; TODO confirm.
attitude_labels <- c("Informal Speech", "Formal Speech")
attitude_colours <- c("lightgreen", "lightpink")

boxplot(
  frequency ~ attitude,
  data = WG,
  main = "Frequency as a Function of Attitude",
  xlab = "Attitude",
  ylab = "Frequency",
  col = attitude_colours,
  names = attitude_labels
)

b.

# Resampling test for the difference in mean frequency between informal
# ("inf") and formal ("pol") speech.

# Rows without a frequency value cannot contribute to a mean, so drop them.
WG_clean <- WG[!is.na(WG$frequency), ]

formal_speech <- WG_clean$frequency[WG_clean$attitude == "pol"]
informal_speech <- WG_clean$frequency[WG_clean$attitude == "inf"]

# Observed statistic: informal mean minus formal mean.
observed_diff <- mean(informal_speech) - mean(formal_speech)

set.seed(123)
n_iterations <- 1000

# Both groups are redrawn, with replacement, from the pooled observations,
# i.e. as if the attitude label made no difference.
population <- c(formal_speech, informal_speech)

# One resampled difference per iteration. Within an iteration the formal
# sample is drawn before the informal one, which fixes the order in which
# random numbers are consumed after set.seed().
boot_diff <- vapply(
  seq_len(n_iterations),
  function(iteration) {
    sample_formal <- sample(
      population, length(formal_speech), replace = TRUE
    )
    sample_informal <- sample(
      population, length(informal_speech), replace = TRUE
    )
    mean(sample_informal) - mean(sample_formal)
  },
  numeric(1)
)

# Two-sided p-value: proportion of resampled differences whose magnitude is
# at least as large as the observed one.
p_value <- mean(abs(boot_diff) >= abs(observed_diff))


observed_diff

## [1] 18.232

p_value

## [1] 0.207

c.

# Two-sided two-sample t-test comparing formal ("pol") and informal ("inf")
# speech frequencies. t.test() is called with its default unequal-variance
# setting, which is reported as the Welch test in the output.

# Drop rows with a missing frequency before splitting by attitude.
has_frequency <- !is.na(WG$frequency)
WG_clean <- WG[has_frequency, ]

is_formal <- WG_clean$attitude == "pol"
is_informal <- WG_clean$attitude == "inf"
formal_speech <- WG_clean$frequency[is_formal]
informal_speech <- WG_clean$frequency[is_informal]

# Formal speech is passed first, so the reported difference and confidence
# interval are formal minus informal.
t_test_result <- t.test(
  formal_speech,
  informal_speech,
  alternative = "two.sided"
)
t_test_result

## 
##  Welch Two Sample t-test
## 
## data:  formal_speech and informal_speech
## t = -1.2726, df = 80.938, p-value = 0.2068
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -46.73684  10.27285
## sample estimates:
## mean of x mean of y 
##  184.3561  202.5881

d.

# The two p-values are nearly identical (0.207 from resampling and 0.2068 from the t-test). Because p ≈ 0.207 is greater than α = 0.05, we fail to reject the null hypothesis: there is not enough evidence of a difference in mean frequency between formal and informal speech.

Question 3

a.

# Percentile-bootstrap 95% confidence interval for the mean frequency of
# female speakers.

# Select the female rows, then drop missing values so that a single NA cannot
# turn the bootstrap means into NA (quantile() would then fail). This has no
# effect when the selected values are all observed.
female_speech <- WG$frequency[WG$gender == "F"]
female_speech <- female_speech[!is.na(female_speech)]

set.seed(123)
n_iterations <- 1000

# Each iteration redraws a sample of the original size, with replacement, and
# records its mean.
bootstrap_means <- vapply(
  seq_len(n_iterations),
  function(iteration) {
    bootstrap_sample <- sample(
      female_speech, length(female_speech), replace = TRUE
    )
    mean(bootstrap_sample)
  },
  numeric(1)
)

# The 2.5% and 97.5% quantiles of the bootstrap means bound the central 95%.
lower_confidence_limit <- quantile(bootstrap_means, 0.025)
upper_confidence_limit <- quantile(bootstrap_means, 0.975)

lower_confidence_limit

##     2.5% 
## 236.4693

upper_confidence_limit

##    97.5% 
## 256.5326

b.

# t-based 95% confidence interval for the mean frequency of female speakers:
# mean +/- t(0.975, n - 1) * SD / sqrt(n).

# Select the female rows, then drop missing values: with an NA present, sd()
# would return NA and length() would overstate the sample size. This has no
# effect when the selected values are all observed.
female_speech <- WG$frequency[WG$gender == "F"]
female_speech <- female_speech[!is.na(female_speech)]

mean_female <- mean(female_speech)
sd_female <- sd(female_speech)
n_female <- length(female_speech)

# Standard error of the mean.
standard_error <- sd_female / sqrt(n_female)

# Half-width of the interval: the 97.5% t quantile leaves 2.5% in each tail.
confidence_interval_margin <- qt(0.975, df = n_female - 1) * standard_error

lower_confidence_limit <- mean_female - confidence_interval_margin
upper_confidence_limit <- mean_female + confidence_interval_margin

lower_confidence_limit

## [1] 236.198

upper_confidence_limit

## [1] 257.7735

c.

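# The two methods give very similar confidence intervals: about 236.47 to 256.53 from the bootstrap and 236.20 to 257.77 from the t-distribution. They do not need to match exactly, because the bootstrap limits come from random resampling while the t-based limits come from a formula, so small differences are expected.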

d.

# Both methods give a 95% confidence interval of roughly 236 to 258, and 260 falls outside it. This suggests that the idea that the mean female frequency is 260 is probably not true; the true mean is more likely a little lower, somewhere inside that interval.

Week 4 Homework

Kelsey Deweese

2024-09-18

Week 4 Homework

Question 1

a.

b.

c.

Question 2

a.

b.

c.

d.

Question 3

a.

b.

c.

d.