Install and activate the packages for this assignment
# Install packages for this analysis
# List of packages maybe needed for this analysis
pkg <- c("reshape2",
"dplyr",
"ggplot2",
"knitr",
"stargazer",
"DT")
# Check if packages are not installed and assign the
# names of the packages not installed to the variable new.pkg
new.pkg <- pkg[!(pkg %in% installed.packages())]
# If there are any packages in the list that aren't installed,
# then install them
if (length(new.pkg)) {
install.packages(new.pkg, repos = "http://cran.rstudio.com")
}
# activate packages
load.pkg <- suppressWarnings(lapply(pkg, require, character.only=TRUE) )
# set working directory
# you only need to put the data and this file
# in the same folder
knitr::opts_chunk$set(root.dir = normalizePath("..")) ## [1] 2500 767
Potential variables:
ftpolice – How would you rate the police?
ftjournal – How would you rate journalists?
First, exploratory analysis on ftpolice (Police rating)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 47.00 70.00 64.68 90.00 100.00
According to CLT, we know the sample mean of police rating is approximately normal, but I still want to see if it is true by doing simulation sampling
police_sample_mean_table <- replicate(n=1e5, expr = mean(sample(A$ftpolice, size=30, replace=F)))
hist(police_sample_mean_table, main="Histogram of Police Rating Sample Mean",xlab="Sample Mean of Police Rating")Now, exploratory analysis on ftjournal (Journal Rating)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -7.00 21.00 52.00 52.26 82.00 100.00
journal_sample_mean_table <- replicate(n=1e5, expr = mean(sample(A$ftjournal, size=30, replace=F)))
hist(journal_sample_mean_table, main = "Histogram of Journal Rating Sample Mean", breaks = 100, xlab="Sample Mean of Journal Rating")Notice that ftpolice and ftjournal are not at the same data scales, so we need to normalize the data as follow:
Now compute the difference between the police rating and journal rating for every voter (in normalized scale)
diff_A <- normalize(A$ftpolice) - normalize(A$ftjournal)
hist(diff_A, main="Histogram of Normalized Difference \n Between Police and Journal Rating", breaks = 100,xlab="Police Rating - Journal Rating")\(H_0:\) The mean difference between the police and the journalists rating equals to zero
\(H_a:\) The mean difference between the police and the journalists rating does not equal to zero
sample_mean_table <- replicate(n=1e5, expr = mean(sample(diff_A, size=30, replace=F)))
hist(sample_mean_table, main="Histogram of Mean Normalized Difference \n Between Police and Journal Rating", breaks = 100,
xlab="Sample Mean of Police Rating - Journal Rating")## [1] 1
Since this is a survey, we should sample the data and run the paired t-test
##
## One Sample t-test
##
## data: diff_A[subset]
## t = 6.2138, df = 749, p-value = 8.58e-10
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 0.06424714 0.12359162
## sample estimates:
## mean of x
## 0.09391938
But if we only sample once, we may encounter some sampling error, so we so sample and run our paired t-test many many times to see how many times that we would reject our null hypothesis
# paired t test: subset sample and perform the test 10,000 times
q1_test_results <- data.frame(matrix(0,nrow=10000, ncol=4))
for (k in 1:10000){
subset <- sample(length(diff_A),floor(length(diff_A)*0.3))
result <- t.test(diff_A[subset])
q1_test_results[k,1] <-format(round(result$p.value, digits=4), nsmall = 4)
q1_test_results[k,c(2,3)] <- round(result$conf.int,4)
q1_test_results[k,4] <- round(result$conf.int[2]-result$conf.int[1],4)
}
colnames(q1_test_results) <- c("P value","95% Lower", "95% Upper","Confident Interval Range")
datatable(q1_test_results, class="cell-border stripe")## [1] 9859
Potential Variables:
birthyr birthyr: profile data: Birth Year
pid7x: Party ID Summary
1&3 = Dem
5&7 = Rep
First, subset the data and make it only includes two columns: birthyr and pid7x. For pid7x, we only want values 1, 2, 3, 5 and 7
q2_A <- A[,c("birthyr","pid7x")]
q2_A$Age <- 2019 - q2_A$birthyr
q2_A <- subset(q2_A, pid7x==1 | pid7x==3 | pid7x==5 | pid7x==7)
q2_A$party <- ifelse(q2_A$pid7x==1|q2_A$pid7x==3,"Dem","Rep")Check out the party distribution: how many people belong to each party in the data
Get the age vector for each party, and do EDA on the age vectors
dem_age <- q2_A[q2_A$party=="Dem",]$Age
rep_age <- q2_A[q2_A$party=="Rep",]$Age
hist(dem_age,main = "Histogram of Dem Age",breaks = 100)sample_mean_table_dem <- NULL
for (i in 1:10000){
sample_index <- sample(length(dem_age),30)
sample_mean_table_dem[i] <- mean(dem_age[sample_index])
}
hist(sample_mean_table_dem, main="Histogram of Sample Mean of Dem Age", breaks=100)sample_mean_table_rep <- NULL
for (i in 1:10000){
sample_index <- sample(length(rep_age),30)
sample_mean_table_rep[i] <- mean(rep_age[sample_index])
}
hist(sample_mean_table_rep, main="Histogram of Sample Mean of Rep Age", breaks = 100)diff <- sample_mean_table_dem-sample_mean_table_rep
hist(diff, main="Histogram of Sample Mean of Age Difference", breaks = 100)## [1] 0.999
\(H_0:\) the mean age of dem equals to the mean age of rep
\(H_a:\) the mean age of dem does not equal to the mean age of rep
Like what we did in question 1, we perform the hypothesis testing many many times to see how often we reject the null hypothesis.
# Perform two sameple t test 10000 on subset of the data
q2_test_results <- data.frame(matrix(0,nrow=10000, ncol=4))
colnames(q2_test_results) <- c("P value","95% Lower","95% Upper","Confident Range")
for (k in 1:10000){
subset_dem <- sample(length(dem_age),floor(length(dem_age)*0.3))
subset_rep <- sample(length(rep_age),floor(length(rep_age)*0.3))
result <- t.test(dem_age[subset_dem],rep_age[subset_rep])
q2_test_results[k,1] <- format(round(result$p.value, digits=4), nsmall = 4)
q2_test_results[k,c(2,3)] <- round(result$conf.int,4)
q2_test_results[k,4] <- round(result$conf.int[2]-result$conf.int[1],4)
}
datatable(q2_test_results)# the % of the test that reject the h0 at 5%
(nrow(q2_test_results[q2_test_results$`P value`<0.05,])/nrow(q2_test_results))## [1] 0.6017