This week’s coding goals

Shorten and understand the code chunks for Fig. 3
Attempt to reproduce average controversiality ratings in the pilot study

Achieving the goals

Fig. 3

Starting off with the usual preparation of the data.

library(tidyverse)
library(dplyr)
library(ggplot2)
library(car)
library(ggeasy)
# ---- Data preparation -------------------------------------------------------
# Load the main-study data and keep only the rows where Q62 == 1.
data <- read.csv("beliefsuperiority_all.csv")
data <- dplyr::filter(data, Q62 == 1)

# Keep respondents who gave the expected answer on both attention checks
# (AC_a == 3 and AC_b == 5); rows with NA on either check are dropped too.
data_attn <- dplyr::filter(data, AC_a == 3, AC_b == 5)

# Drop the attention-check columns. This also keeps AC_a / AC_b out of the
# ends_with("_a") / ends_with("_b") selections used further down.
data_attn <- dplyr::select(data_attn, -starts_with("AC"))

# Reverse-score ten Q37 items on their 1-9 scale (5 is the midpoint and maps
# to itself). car::recode is called explicitly: dplyr also exports a recode()
# with a different interface, so the bare name only works when car happens to
# be attached after dplyr.
reverse_items <- paste0("Q37_", c(2, 4, 5, 7, 10, 11, 13, 16, 18, 19))
reverse_spec <- "1=9; 2=8; 3=7; 4=6; 6=4; 7=3; 8=2; 9=1"
for (item in reverse_items) {
  data_attn[[item]] <- car::recode(data_attn[[item]], reverse_spec)
}

# Per-respondent mean over all Q37 items (missing items are ignored).
# NOTE(review): "Dog" presumably stands for the dogmatism scale -- confirm.
dogscale <- dplyr::select(data_attn, starts_with("Q37"))
data_attn$meanDog <- rowMeans(dogscale, na.rm = TRUE)

# Rescale three attitude items so that codes 2..5 become 1, 2.3333, 3.6667, 5.
# NOTE(review): this looks like stretching a 4-point response onto the 1-5
# range of the other attitude items -- confirm against the codebook.
rescale_items <- c("vote_a", "torture_a", "affirmaction_a")
rescale_spec <- "2=1; 3=2.3333; 4=3.6667; 5=5"
for (item in rescale_items) {
  data_attn[[item]] <- car::recode(data_attn[[item]], rescale_spec)
}

# Per-respondent mean over every attitude item (columns ending in "_a").
attitudes <- dplyr::select(data_attn, ends_with("_a"))
data_attn$meanAtt <- rowMeans(attitudes, na.rm = TRUE)

Based on the article, Fig.3 plots the belief-superiority rating per subject and per topic against the corresponding attitude rating. So I included the mean centered scores for all attitude questions.

# Grand-mean centring: subtract the sample mean (ignoring NAs) from a vector.
centre_on_mean <- function(v) {
  v - mean(v, na.rm = TRUE)
}

# Mean-centred average attitude and political orientation (Q12).
data_attn$meanA_c <- centre_on_mean(data_attn$meanAtt)
data_attn$PO_c <- centre_on_mean(data_attn$Q12)

# Mean-centred attitude rating for each individual issue. The new "<issue>_a_c"
# columns are appended in this order; later code selects columns by position,
# so the order must not change.
issues <- c(
  "immigration", "abortion", "vote", "tax",
  "torture", "affirmaction", "military", "covidgov"
)
for (issue in issues) {
  rating_col <- paste0(issue, "_a")
  data_attn[[paste0(rating_col, "_c")]] <- centre_on_mean(data_attn[[rating_col]])
}

Like Fig. 1 and 2, I ran the original code to see what I’m replicating.

# Original authors' Fig. 3 code, kept verbatim for comparison with the
# shortened version.
# geom_flat_violin() (a half-violin geom) is loaded from a gist whose URL is
# pinned to a specific revision hash.
source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")

# Sequential participant id: 1 .. number of rows in data_attn.
data_attn$subject=c(1:nrow(data_attn))

# Long-format table built from three separate reshapes (centred attitudes,
# belief-superiority ratings, raw attitudes) glued side by side.
# bind_cols() matches rows purely by position, so this assumes the "_a_c",
# "_b" and "_a" columns list the topics in the same order -- TODO confirm.
# NOTE(review): 38:67 and 76 pick extra columns by position and therefore
# depend on the column layout produced by the preparation code.
data_attnlong <- bind_cols(data_attn %>% dplyr::select(ends_with("_a_c"), subject,38:67,76) %>% gather(topic, attitude_c, ends_with("_a_c")), data_attn %>% dplyr::select(ends_with("_b"), subject) %>% gather(topic, beliefsup, ends_with("_b")), data_attn %>% dplyr::select(ends_with("_a"), subject) %>% gather(topic, attitude, ends_with("_a")))

# Treat each attitude rating as a discrete level, then keep only the rows that
# have an attitude rating.
data_attnlong$attitude=as.factor(data_attnlong$attitude)
forplot= data_attnlong[which(!is.na(data_attnlong$attitude)),]

# NOTE(review): raincloud_theme is defined here but is never added to the plot
# below.
raincloud_theme = theme(
  text = element_text(size = 10),
  axis.title.x = element_text(size = 16),
  axis.title.y = element_text(size = 16),
  axis.text = element_text(size = 14),
  axis.text.x = element_text(angle = 45, vjust = 0.5),
  legend.title=element_text(size=16),
  legend.text=element_text(size=16),
  legend.position = "right",
  plot.title = element_text(lineheight=.8, face="bold", size = 16),
  panel.border = element_blank(),
  panel.grid.minor = element_blank(),
  panel.grid.major = element_blank(),
  axis.line.x = element_line(colour = 'black', size=0.5, linetype='solid'),
  axis.line.y = element_line(colour = 'black', size=0.5, linetype='solid'))

# Belief superiority by attitude level: half violin (nudged right), jittered
# raw points, a line and points at the per-level mean, and error bars from
# mean_cl_boot with conf.int = .95.
ggplot(forplot, aes(x=attitude, y=beliefsup,color=attitude))+
  # NOTE(review): theme_minimal() is added later in the chain and presumably
  # resets this setting, which would explain why legend.position is set again
  # near the end.
  theme(legend.position= "none") +
  geom_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .8) +
  geom_point(aes(y = beliefsup, color = attitude), position = position_jitter(width = .15), 
             size = 1.5, alpha = 0.2) +
  # aes(group = 1) puts all attitude levels in one group so the means can be
  # joined by a single line across the discrete x-axis.
   stat_summary(fun.y=mean, size=2, color="black",geom="line", aes(group = 1)) +
  stat_summary(fun.y=mean, size=2, color="black",geom="point", aes(group = 1)) +
  stat_summary(fun.data = mean_cl_boot,geom='errorbar', fun.args=list(conf.int=.95), 
               size=1.5, aes(width=.3), color="black") +
  labs(x='Attitude', y='Belief Superiority') +
   theme_minimal()+
  theme(axis.title.y = element_text(size=16, face="bold"))+
  theme(axis.title.x = element_text(size=16, face="bold"))+
  theme(axis.text.y=element_text(color = "black", size = 14))+
  theme(axis.text.x=element_text(color = "black", size = 14))+
  theme(legend.text = element_text(color = "black", size = 14))+
  theme(legend.title = element_text(color = "black", size = 14))+
  theme(axis.line= element_line(color="black")) +
  theme(axis.ticks.y = element_line(color="black")) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
  theme(strip.text.x=element_text(color = "black", size = 14, face="bold"))+
  theme(legend.position = "none")+
  # Seven hard-coded labels: one per attitude level present after the recoding
  # of the rescaled items (2.3333 and 3.6667 shown as 2.3 and 3.7).
  scale_x_discrete(labels = c("1","2","2.3","3","3.7","4","5"))

Looking at the code, I see that I can omit raincloud_theme alongside the majority of the themes.

The resulting shortened chunk looks like this.

# ---- Fig. 3 (shortened version) ---------------------------------------------
# geom_flat_violin() (a half-violin geom) is not part of ggplot2; it is loaded
# from a gist whose URL is pinned to a specific revision hash.
source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")

# Sequential participant id. seq_len() is used instead of 1:nrow() so that an
# empty data frame yields an empty id vector rather than c(1, 0).
data_attn$subject <- seq_len(nrow(data_attn))

# Reshape three column sets to long format separately, then glue them side by
# side. bind_cols() matches rows purely by position, so this assumes the
# "_a_c", "_b" and "_a" columns list the topics in the same order -- TODO
# confirm. NOTE(review): 38:67 and 76 pick extra columns by position and
# therefore depend on the column layout produced by the preparation code.
attitude_c_long <- data_attn %>%
  dplyr::select(ends_with("_a_c"), subject, 38:67, 76) %>%
  gather(topic, attitude_c, ends_with("_a_c"))
beliefsup_long <- data_attn %>%
  dplyr::select(ends_with("_b"), subject) %>%
  gather(topic, beliefsup, ends_with("_b"))
attitude_long <- data_attn %>%
  dplyr::select(ends_with("_a"), subject) %>%
  gather(topic, attitude, ends_with("_a"))
data_attnlong <- bind_cols(attitude_c_long, beliefsup_long, attitude_long)

# Treat each attitude rating as a discrete level, then keep only the rows that
# have an attitude rating.
data_attnlong$attitude <- as.factor(data_attnlong$attitude)
forplot <- data_attnlong[which(!is.na(data_attnlong$attitude)), ]

# Belief superiority by attitude level: half violin (nudged right), jittered
# raw points, the per-level mean joined by a line, and bootstrapped 95% CIs.
ggplot(forplot, aes(x = attitude, y = beliefsup, color = attitude)) +
  geom_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .8) +
  geom_point(aes(y = beliefsup), position = position_jitter(width = .15),
             size = 1.5, alpha = 0.3) +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary().
  # aes(group = 1) puts all levels in one group so the means can be joined by
  # a single line across the discrete x-axis.
  stat_summary(fun = mean, size = 2, color = "black", geom = "line",
               aes(group = 1)) +
  stat_summary(fun = mean, size = 2, color = "black", geom = "point",
               aes(group = 1)) +
  # mean_cl_boot relies on the Hmisc package being installed.
  stat_summary(fun.data = mean_cl_boot, geom = "errorbar",
               fun.args = list(conf.int = .95),
               size = 1.5, aes(width = .3), color = "black") +
  labs(x = "Attitude", y = "Belief Superiority") +
  theme_minimal() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  # Seven hard-coded labels and colours: one per attitude level present after
  # the recoding of the rescaled items (2.3333 / 3.6667 shown as 2.3 / 3.7).
  scale_x_discrete(labels = c("1", "2", "2.3", "3", "3.7", "4", "5")) +
  easy_remove_legend() +
  theme(axis.line = element_line(color = "black")) +
  theme(axis.ticks.y = element_line(color = "black")) +
  scale_colour_manual(values = c("#CC0000", "#660000", "#331900", "#000000",
                                 "#3399FF", "#0000FF", "#000066"))

Starting from the beginning, a subject column is added to data_attn. It simply numbers the rows from 1 to the total number of rows (nrow()), so that every participant has a unique ID that can be carried through the later reshaping and calculations.

A new dataframe (data_attnlong) was created by combining specific columns (bind_cols()) from data_attn. For example, select() is used to isolate the columns that end with “_a_c”, the subject column, and columns 38 - 67 as well as column 76. gather() (from tidyr) is then used to make a new key column called “topic” that lists all the column names ending in “_a_c”. gather() also creates a value column (attitude_c) that holds the corresponding values from those “_a_c” columns. This is done again for the belief superiority ratings (columns ending in “_b”) and the attitude ratings (columns ending in “_a”).

Since the attitude scores place participants on a scale from very liberal (1) to very conservative (5), the numeric values in the attitude column are converted to a factor (as.factor()) so that each rating level is treated as a separate category on the x-axis.

Another dataframe (forplot) is created to identify the position of TRUE values (which()) in the attitude column from data_attnlong.

is.na() returns TRUE for every missing value (NA) and FALSE otherwise. The ! before is.na() reverses this, meaning that TRUE now indicates a complete value and FALSE indicates an NA value.

Square brackets is used to ask R to index these TRUE values in data_attnlong.

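The comma inside the square brackets is essential when subsetting dataframes: in dataframe[rows, columns] the part before the comma selects rows and the part after it selects columns, so leaving the column part empty keeps every column. However, I don’t yet fully understand all the details of this indexing syntax.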

Hence, the researchers created “forplot” which only includes complete values that appear in the attitude column in data_attnlong.

Finally, to plot the values from forplot. Ggplot is used to plot attitude ratings on the x-axis and belief superiority ratings on the y-axis. The colour function was applied to only attitude ratings. Geom_flat_violin is used to create a half violin graph that is shifted via position_nudge() .2 points to the right. geom_point is used to plot the values in a scatterplot style.

Position nudge is used to shift discrete values hence, position_nudge was used for attitude ratings which was changed to a factor variable earlier.

The stat_summary() function from the ggplot2 package was used to create: - a line connecting the mean belief superiority rating at each attitude level (geom = “line”) - a mean point for each attitude level (geom = “point”) - error bars (geom = “errorbar”)

and fun.args is used to pass extra arguments to the summary function; here conf.int = .95 requests 95% confidence intervals.

fun.data tells stat_summary() which summary function to apply to the y values (belief superiority) within each attitude level of the plotted data (forplot)

fun.y is an old (deprecated) argument, since renamed to fun, that specifies a summary function applied to a vector of y values which, in this case, gives the mean for each attitude level

fun.data = mean_cl_boot uses bootstrapping to generate the CI. However, I don’t fully understand this function

Other aesthetics such as labels (via labs) were applied. Since the x-variable (attitude ratings) is a discrete variable, scale_x_discrete was used to apply labels on the x-axis. The background grid lines are separately (panel.grid.major/ panel.grid.minor) formatted to be blank. The axis lines were made to appear black (axis.line = element_line(color = “black”)) and ggeasy (easy_remove_legend()) was used to remove the legend from the graph. scale_colour_manual() was used to create a colour gradient similar to the original article.

Manual insertion of codes for colours was required since variables on the x-axis were not continuous.

IMPORTANT Like previous graphs, Fig. 3 was a flipped version of the original graph. However, unlike Fig. 1 and 2, it couldn’t be flipped by applying a negative sign before the x-variable which, reverses the order data is plotted (descending to ascending).

Pilot study

The pilot study only required me to replicate the controversiality ratings that the researchers generated. The aim of the pilot study was to identify and keep only the topics that were deemed controversial enough to be included in the main study. The researchers used a criterion of a mean rating of 4 or above as being acceptable.

To do this I first downloaded the original pilot data from this link. Since it was .csv data from Excel, read.csv() was used to have R read the data. The which () function gives us the position of the element “1” in the PA column in the Pdata dataframe.

The square brackets were used to index/ extract specific values in Pdata.

Hence here, the researchers essentially extracted all the rows coded as 1 in the PA column of the Pdata dataframe and stored these rows in a new dataframe, “Ds”. They did the same for the Republicans (Rs) and Independents (Is).

c() is then used to create empty objects called overallmeans, Dmeans, Rmeans and Imeans in the global environment, ready to be filled in by the loop.

for() is used to tell R to repeat the same operations on each of columns 9 to 30 (i.e. the controversiality questions).

For the grand mean, R is instructed to first make a Mean value by calculating the mean of the current column in the Pdata dataframe, removing any NA values from the calculation. The “overallmeans” vector in the global environment is then extended to include this newly calculated mean. The same is done for the mean controversiality ratings within each political affiliation category.

# ---- Pilot study: mean controversiality ratings -----------------------------
Pdata <- read.csv("pilotdata_all.csv")

# Split respondents by the PA (political affiliation) code. which() returns
# the row positions where the comparison is TRUE, so rows with a missing PA
# are left out. NOTE(review): code 1 is presumably Democrats (Ds), 2
# Republicans (Rs) and 3 Independents (Is) -- confirm against the codebook.
Ds <- Pdata[which(Pdata$PA == 1), ]
Rs <- Pdata[which(Pdata$PA == 2), ]
Is <- Pdata[which(Pdata$PA == 3), ]

# Start with empty containers; each pass of the loop appends one named mean.
overallmeans <- c()
Dmeans <- c()
Rmeans <- c()
Imeans <- c()

# Columns 9 to 30 are averaged one at a time, ignoring NAs: first over the
# whole sample, then within each affiliation group. colMeans() on a one-column
# data frame returns a length-1 vector named after that column, which is why
# the results print with the question labels.
for (x in 9:30) {
  Mean <- colMeans(Pdata[x], na.rm = TRUE)
  overallmeans <- c(overallmeans, Mean)

  Mean <- colMeans(Ds[x], na.rm = TRUE)
  Dmeans <- c(Dmeans, Mean)

  Mean <- colMeans(Rs[x], na.rm = TRUE)
  Rmeans <- c(Rmeans, Mean)

  Mean <- colMeans(Is[x], na.rm = TRUE)
  Imeans <- c(Imeans, Mean)
}

Viewing the values will show that Independents showed means lower than 4 for Q6_4 and Q6_12 which are topics regarding government assistance and Muslim religious rights. Thus, I’ve successfully reproduced the controversiality ratings.

overallmeans

##     Q6_2     Q6_3     Q6_4     Q6_5     Q6_6     Q6_7     Q6_8     Q6_9 
## 5.504762 5.761905 4.371429 4.685714 4.865385 4.923077 5.009524 4.904762 
##    Q6_10    Q6_11    Q6_12    Q6_13    Q6_14    Q6_15    Q6_16    Q6_17 
## 5.123810 5.257143 4.596154 5.211538 4.904762 5.352381 4.276190 4.567308 
##    Q6_18    Q6_19    Q6_20    Q6_21    Q6_22    Q22_1 
## 4.586538 4.952381 5.790476 4.942857 5.104762 4.137255

Dmeans

##     Q6_2     Q6_3     Q6_4     Q6_5     Q6_6     Q6_7     Q6_8     Q6_9 
## 5.760870 5.804348 4.782609 4.608696 4.755556 4.956522 4.891304 5.000000 
##    Q6_10    Q6_11    Q6_12    Q6_13    Q6_14    Q6_15    Q6_16    Q6_17 
## 5.065217 5.478261 4.711111 5.488889 5.065217 5.369565 4.152174 4.434783 
##    Q6_18    Q6_19    Q6_20    Q6_21    Q6_22    Q22_1 
## 4.565217 5.043478 5.739130 4.956522 5.108696 4.133333

Rmeans

##     Q6_2     Q6_3     Q6_4     Q6_5     Q6_6     Q6_7     Q6_8     Q6_9 
## 5.424242 5.969697 4.454545 4.909091 4.878788 5.030303 5.303030 4.757576 
##    Q6_10    Q6_11    Q6_12    Q6_13    Q6_14    Q6_15    Q6_16    Q6_17 
## 5.333333 5.424242 4.909091 5.030303 5.030303 5.363636 4.636364 4.843750 
##    Q6_18    Q6_19    Q6_20    Q6_21    Q6_22    Q22_1 
## 4.906250 4.818182 5.696970 5.272727 4.969697 4.064516

Imeans

##     Q6_2     Q6_3     Q6_4     Q6_5     Q6_6     Q6_7     Q6_8     Q6_9 
## 5.217391 5.304348 3.347826 4.521739 4.869565 4.727273 4.695652 4.826087 
##    Q6_10    Q6_11    Q6_12    Q6_13    Q6_14    Q6_15    Q6_16    Q6_17 
## 4.826087 4.739130 3.739130 4.956522 4.304348 5.217391 4.043478 4.347826 
##    Q6_18    Q6_19    Q6_20    Q6_21    Q6_22    Q22_1 
## 4.130435 4.869565 5.956522 4.304348 5.130435 4.434783

Challenges and successes

This week was relatively problem-free in terms of understanding the coding. I also had almost no issues running and knitting the chunks. However, a few missing commas did lead me to spend quite a while figuring out why the “undefined columns selected” error message was appearing.

Compared to other weeks however, this week has been the best week yet.

The next stage

Since I’ve finished explaining and shortening all the relevant code, I’ll be starting my verification report and delving into exploratory analyses next week.

Week 7 Learning Log

Fun Hui