Packages

Install and activate the packages for this assignment

# Install packages for this analysis 
# List of packages maybe needed for this analysis
pkg <- c("reshape2",
         "dplyr",
         "ggplot2", 
         "knitr", 
         "stargazer",
         "DT")

# Check if packages are not installed and assign the
# names of the packages not installed to the variable new.pkg
new.pkg <- pkg[!(pkg %in% installed.packages())]

# If there are any packages in the list that aren't installed,
# then install them
if (length(new.pkg)) {
  install.packages(new.pkg, repos = "http://cran.rstudio.com")
}

# activate packages 
load.pkg <- suppressWarnings(lapply(pkg, require, character.only=TRUE) )

# set working directory
# you only need to put the data and this file
# in the same folder
knitr::opts_chunk$set(root.dir = normalizePath(".."))

Load the Data

A = read.csv("anes_pilot_2018.csv")
dim(A) # check the dimension of the data

## [1] 2500  767

Research Question 1

Do US voters have more respect for the police or for journalists?

Introduce your topic briefly. (5 points)

Potential variables:

ftpolice – How would you rate the police?
ftjournal – How would you rate journalists?

Perform an exploratory data analysis (EDA) of the relevant variables. (5 points)

First, exploratory analysis on ftpolice (Police rating)

summary(A$ftpolice)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   47.00   70.00   64.68   90.00  100.00

boxplot(A$ftpolice, main="Boxplot of Police Rating")

hist(A$ftpolice,breaks = 100, main="Histogram of Police Rating")

According to CLT, we know the sample mean of police rating is approximately normal, but I still want to see if it is true by doing simulation sampling

police_sample_mean_table <- replicate(n=1e5, expr = mean(sample(A$ftpolice, size=30, replace=F)))

hist(police_sample_mean_table, main="Histogram of Police Rating Sample Mean",xlab="Sample Mean of Police Rating")

Now, exploratory analysis on ftjournal (Journal Rating)

summary(A$ftjournal)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -7.00   21.00   52.00   52.26   82.00  100.00

boxplot(A$ftjournal, main="Boxplot of Journal Rating")

hist(A$ftjournal,breaks = 100,main="Boxplot of Journal Rating"
     , xlab="Journal Rating")

journal_sample_mean_table <- replicate(n=1e5, expr = mean(sample(A$ftjournal, size=30, replace=F)))

hist(journal_sample_mean_table, main = "Histogram of Journal Rating Sample Mean", breaks = 100, xlab="Sample Mean of Journal Rating")

Notice that ftpolice and ftjournal are not at the same data scales, so we need to normalize the data as follow:

# normalized function 
normalize <- function(x) {
    return ((x - min(x)) / (max(x) - min(x)))
}

Now compute the difference between the police rating and journal rating for every voter (in normalized scale)

diff_A <- normalize(A$ftpolice) - normalize(A$ftjournal)

hist(diff_A, main="Histogram of Normalized Difference \n Between Police and Journal Rating", breaks = 100,xlab="Police Rating - Journal Rating")

Based on your EDA, select an appropriate hypothesis test. (5 points)

\(H_0:\) The mean difference between the police and the journalists rating equals to zero

\(H_a:\) The mean difference between the police and the journalists rating does not equal to zero

sample_mean_table <- replicate(n=1e5, expr = mean(sample(diff_A, size=30, replace=F)))

hist(sample_mean_table, main="Histogram of Mean Normalized Difference \n Between Police and Journal Rating", breaks = 100,
     xlab="Sample Mean of Police Rating - Journal Rating")

mean(ifelse(sample_mean_table > 0 | sample_mean_table < 0, 1, 0))

## [1] 1

Conduct your test. (5 points)

Since this is a survey, we should sample the data and run the paired t-test

subset <- sample(length(diff_A),floor(length(diff_A)*0.3))
t.test(diff_A[subset])

## 
##  One Sample t-test
## 
## data:  diff_A[subset]
## t = 6.2138, df = 749, p-value = 8.58e-10
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.06424714 0.12359162
## sample estimates:
##  mean of x 
## 0.09391938

But if we only sample once, we may encounter some sampling error, so we so sample and run our paired t-test many many times to see how many times that we would reject our null hypothesis

# paired t test: subset sample and perform the test 10,000 times
q1_test_results <- data.frame(matrix(0,nrow=10000, ncol=4))

for (k in 1:10000){
  subset <- sample(length(diff_A),floor(length(diff_A)*0.3))
  result <- t.test(diff_A[subset])
  q1_test_results[k,1] <-format(round(result$p.value, digits=4), nsmall = 4) 
  q1_test_results[k,c(2,3)] <- round(result$conf.int,4)
  q1_test_results[k,4] <- round(result$conf.int[2]-result$conf.int[1],4)
}

colnames(q1_test_results) <- c("P value","95% Lower", "95% Upper","Confident Interval Range")

datatable(q1_test_results, class="cell-border stripe")

# number of tests reject the h0 at 5%
nrow(q1_test_results[q1_test_results$`P value`<0.05,])

## [1] 9859

Research Question 2

Are Republican voters older or younger than Democratic voters?

Introduce your topic briefly. (5 points)

Potential Variables:

birthyr birthyr: profile data: Birth Year
pid7x: Party ID Summary
- 1&3 = Dem
- 5&7 = Rep

Perform an exploratory data analysis (EDA) of the relevant variables. (5 points)

First, subset the data and make it only includes two columns: birthyr and pid7x. For pid7x, we only want values 1, 2, 3, 5 and 7

q2_A <- A[,c("birthyr","pid7x")]
q2_A$Age <- 2019 - q2_A$birthyr
q2_A <- subset(q2_A, pid7x==1 | pid7x==3 | pid7x==5 | pid7x==7)

q2_A$party <- ifelse(q2_A$pid7x==1|q2_A$pid7x==3,"Dem","Rep")

Check out the party distribution: how many people belong to each party in the data

bb <- barplot(table(q2_A$party))
text(bb,table(q2_A$party)-15,labels=table(q2_A$party),cex=.8)

Get the age vector for each party, and do EDA on the age vectors

dem_age <- q2_A[q2_A$party=="Dem",]$Age

rep_age <- q2_A[q2_A$party=="Rep",]$Age

hist(dem_age,main = "Histogram of Dem Age",breaks = 100)

sample_mean_table_dem <- NULL

for (i in 1:10000){
  sample_index <- sample(length(dem_age),30)
  sample_mean_table_dem[i] <- mean(dem_age[sample_index])
}

hist(sample_mean_table_dem, main="Histogram of Sample Mean of Dem Age", breaks=100)

hist(rep_age,main="Histogram of Rep Age", breaks = 100)

sample_mean_table_rep <- NULL

for (i in 1:10000){
  sample_index <- sample(length(rep_age),30)
  sample_mean_table_rep[i] <- mean(rep_age[sample_index])
}

hist(sample_mean_table_rep, main="Histogram of Sample Mean of Rep Age", breaks = 100)

diff <- sample_mean_table_dem-sample_mean_table_rep

hist(diff, main="Histogram of Sample Mean of Age Difference", breaks = 100)

mean(ifelse(diff > 0|diff <0, 1, 0))

## [1] 0.999

Based on your EDA, select an appropriate hypothesis test. (5 points)

\(H_0:\) the mean age of dem equals to the mean age of rep

\(H_a:\) the mean age of dem does not equal to the mean age of rep

Conduct your test. (5 points)

Like what we did in question 1, we perform the hypothesis testing many many times to see how often we reject the null hypothesis.

# Perform two sameple t test 10000 on subset of the data

q2_test_results <- data.frame(matrix(0,nrow=10000, ncol=4))
colnames(q2_test_results) <- c("P value","95% Lower","95% Upper","Confident Range")

for (k in 1:10000){
  subset_dem <- sample(length(dem_age),floor(length(dem_age)*0.3))
  subset_rep <- sample(length(rep_age),floor(length(rep_age)*0.3))
  result <- t.test(dem_age[subset_dem],rep_age[subset_rep])
  q2_test_results[k,1] <- format(round(result$p.value, digits=4), nsmall = 4) 
  q2_test_results[k,c(2,3)] <- round(result$conf.int,4)
  q2_test_results[k,4]  <- round(result$conf.int[2]-result$conf.int[1],4)
}

datatable(q2_test_results)

# the % of the test that reject the h0 at 5%
(nrow(q2_test_results[q2_test_results$`P value`<0.05,])/nrow(q2_test_results))

## [1] 0.6017

Lab 2 Solution

w203: Statistics for Data Science

October 25 2019

Packages

Load the Data

Research Question 1

Do US voters have more respect for the police or for journalists?

Introduce your topic briefly. (5 points)

Perform an exploratory data analysis (EDA) of the relevant variables. (5 points)

Based on your EDA, select an appropriate hypothesis test. (5 points)

Conduct your test. (5 points)

Research Question 2

Are Republican voters older or younger than Democratic voters?

Introduce your topic briefly. (5 points)

Perform an exploratory data analysis (EDA) of the relevant variables. (5 points)

Based on your EDA, select an appropriate hypothesis test. (5 points)

Conduct your test. (5 points)