Week3_gp_comparison

#load data
gp<-read.csv("~/Downloads/GG identification across groups Wave 1.csv") %>%
  dplyr::select(CONDITION:Gender, LocationLatitude, LocationLongitude) %>%
  mutate(CONDITION=dplyr::recode(CONDITION, '0'="Experiment", '1'="Control"))

Part I: Control vs Experimental Groups

We first compare if there is any significant difference between participants from control group who only answered questions 9-17 and the other participants that rated the identity of all the different groups.

#Scaled mean value cleveland dot plot
new.gp<-gp %>% dplyr::select(CONDITION,Essentialism_SCL:Gender) 
Scale.gp <- new.gp
Scale.gp[,-1]<-scale(new.gp[,-1])

Scale.gp <- Scale.gp %>% 
  group_by(CONDITION) %>%
  summarize_all(mean,na.rm=TRUE)%>%
  gather( "variable", "Mean", Essentialism_SCL:Gender)

cd <- ggplot(Scale.gp, aes(x = Mean,
                            y = fct_reorder2(variable, CONDITION, -Mean),
                            color = CONDITION)) +
  geom_point() + ylab("") +
  ggtitle("Scaled Mean Value Sorted by Experiment Group")
cd

Interestingly, participants who rated the identity of all groups have very neutral attitude (close to value zero under the standardized normal distribution N(0,1)) towards question 9 to 17. While for people who were not assigned group identification questions, they have very diverse answer for question 9 to 17, which indicates that answering group identification questions may influence how participants think about ingroup and outgroup.

Part II: Social Group

We then move on to group (Italians, Mexicans, etc.) comparisons. We are interested in knowing whether different social groups perceive group identifications differently.

–1. diverging stacked bar chart

### Change Likert scores to factor and specify levels
gps <- gp %>% filter(CONDITION == "Experiment") %>%
  dplyr::select(Muslim_in_out_ID_prominance:Italian_in_out_ID_prominance) %>%
  gather("social_group", "in_out")
  
  gps$in_out <- as.factor(gps$in_out) #change -1,0,1 to factors
  
  gpss <- gps %>% group_by(social_group, in_out) %>% dplyr::summarise(count = n()) %>%
  spread(in_out, count) #make a dataframe counting in/outgroup identity for each group
  #gpss
  
  HH::likert(gpss, main = "In/Outgroup Identification of 8 Groups", positive.order = TRUE, 
             xlab = "count")

There are eight diverging stacked bars in the plot, each shows a different group identification (1: African, 2:Chinese, …, 8:Russian). Red on the left indicates the outgroup identity is higher and blue on the right indicates the ingroup. Since each participant is equally likely to finish/skip the group ingroup and outgroup identity question, taking the raw counts is as efficient as using percentage. Comparing first from the top and the second from the bottom bar, people remark African-Americans highest in ingroup and lowest in outgroup while Muslim-Americans more as outgroup.

–2. Grouped bar-plot comparing ingroup and outgroup identity for each group with Statistical significance test (ridgeline plot)

prepare data to draw comparison barplot

ds <- gp %>%
  tibble::rowid_to_column() %>%
  gather("group", value, Muslims_3:Italian_4) %>%
  dplyr::select(rowid, group, value)
  
ds$groups <-
  ifelse(ds$group == 'Muslims_3' | ds$group == 'Muslims_4', "Muslims",
  ifelse(ds$group == 'Chinese_3' | ds$group == 'Chinese_4', "Chinese",
  ifelse(ds$group == 'Hispanic_3' | ds$group == 'Hispanic_4', "Hispanic",
  ifelse(ds$group == 'Russian_3' | ds$group == 'Russian_4', "Russian",
  ifelse(ds$group == 'Irish_3' | ds$group == 'Irish_4', "Irish",
  ifelse(ds$group == 'African_3' | ds$group == 'African_4', "African",
  ifelse(ds$group == 'Mexican_3' |ds$group == 'Mexican_4',"Mexican","Italian")))))))
  
  ds$identity <-
  ifelse(grepl("^.+\\_(3)$", ds$group) == TRUE, "ingroup", "outgroup") #use regular expression to extract the last digit of group (eg:Muslims_4 -> 4)
  
  ds.mean <-
  ds %>% select(rowid, groups, identity, value) %>% spread(identity, value) %>%
  select(groups, ingroup, outgroup) %>% group_by(groups) %>%
  summarise_all(mean, na.rm = T) %>%
  gather(identity, mean, ingroup:outgroup)
  #ds.mean

Identification Comparison

ggplot(ds.mean, aes(identity, mean, fill = groups)) +
  geom_bar(stat = "identity", position = "dodge") +
  xlab("") + ggtitle("Ingroup & Outgroup Identification Comparison for each group")

ds.raw<-ds %>% select(groups, identity, value) %>% dplyr::rename(identification=identity)
#stat_compare_means(): easy to use solution to automatically add p-values and significance levels to a ggplot 
ggbarplot(ds.raw, x = "identification", y = "value", add = "mean_se",
          color = "groups", palette = "simpsons", 
          position = position_dodge(0.8))+
  stat_compare_means(aes(group = groups), label = "p.signif", label.y = 90)+ #add pairwise comparisons p-value
  stat_compare_means(method = "anova", label.y=100)+
  xlab("")+ylab("mean")#+ggtitle("Group Comparison for ingroup & outgroup")

ggbarplot(ds.raw, x = "identification", y = "value", add = "mean_se",
          fill = "groups", palette = "simpsons", 
          position = position_dodge(0.8))+
  stat_compare_means(aes(group = groups), label = "p.signif", label.y = 90)+ #add pairwise comparisons p-value
  stat_compare_means(method = "anova", label.y=100)+
  xlab("")+ylab("mean")#+ggtitle("Group Comparison for ingroup & outgroup")

  #Significance test for identification comparison
  ds.test <- ds %>% dplyr::select(groups, identity, value)
  
  #anova test results
  result <- data.frame(matrix(nrow = 2, ncol = 3))
  colnames(result) <- c("identity", "f_val", "p_val")
  
  for (i in unique(ds.test$identity)) {
  #f-val and p-val are ordered the same as in the original dataset
  aovtest <-
  anova(lm(value ~ groups, data = ds.test[which(ds.test$identity == i), ], var.equal =
  TRUE))
  result[i, 1] <- i
  result[i, 2] <- aovtest$`F value`[[1]]
  result[i, 3] <- aovtest$`Pr(>F)`[[1]]
  }

#glht stands for general linear hypothesis tests

#model: a fitted model, for example an object returned by aov().

#lincft(): a specification of the linear hypotheses to be tested. Multiple comparisons in ANOVA models are specified by objects returned from the function mcp().

#we use glht() to perform multiple pairwise-comparisons for a one-way ANOVA

#multiple comparisons, this may not be necessay due to large group number (almost 30 combinations)
  ds.test$groups <- as.factor(ds.test$groups)
  
  #summary(glht(lm(value ~ groups, data = ds.test[which(ds.test$identity =='ingroup'), ]), linfct = mcp(groups = "Tukey")))
 
   #summary(glht(lm(value ~ groups, data = ds.test[which(ds.test$identity =='outgroup'), ]), linfct = mcp(groups = "Tukey")))



  ds.compare <- result %>% slice(3:4)
  ds.compare$dir <-
  ifelse(ds.compare$f_val > 0, "positive", "negative")
  #ds.sig_level <- factor(ds.compare$sig_level, levels = c("sig", "marginally_sig", "non-sig")) #define the order of legend bar
  
  ds.compare$sig_level <- ifelse( ds.compare$p_val <= 0.05, "sig", ifelse(ds.compare$p_val > 0.1, "non_sig", "marginally_sig"))
  ds.compare

##   identity    f_val        p_val      dir sig_level
## 1  ingroup 21.49652 1.131514e-28 positive       sig
## 2 outgroup 33.44121 1.447726e-45 positive       sig

  p <- ggplot(ds.compare, aes(x = identity, y = p_val, fill = dir)) +
    scale_fill_manual(values = c(negative = "red", positive = "green")) +
    geom_bar(stat = "identity") +
    #ylab("Correlation with GG_Level")+
    #coord_flip()+
    xlab("") +
    ggtitle("Statistical significance check for group comparison")
    
    label.df <- ds.compare %>% dplyr::filter(ds.compare$p_val < 0.05)
    p + geom_text(data = label.df, label = "*")

Group Comparison

ggplot(ds.mean, aes(groups, mean, fill = identity)) +
  geom_bar(stat = "identity",  position = "dodge") +
  xlab("") + ggtitle("Group Comparison for ingroup & outgroup")

ggbarplot(ds.raw, x = "groups", y = "value", add = "mean_se",
          color = "identification", palette = "simpsons", 
          position = position_dodge(0.8))+
  stat_compare_means(aes(group = identification), label = "p.signif", label.y = 90)+ #add pairwise comparisons p-value
  stat_compare_means(method = "anova", label.y=100)+
  xlab("")+ylab("mean")#+ggtitle("Group Comparison for ingroup & outgroup")

ggbarplot(ds.raw, x = "groups", y = "value", add = "mean_se",
          fill = "identification", palette = "simpsons", 
          position = position_dodge(0.8))+
  stat_compare_means(aes(group = identification), label = "p.signif", label.y = 90)+ #add pairwise comparisons p-value
  stat_compare_means(method = "anova", label.y=100)+
  xlab("")+ylab("mean")#+ggtitle("Group Comparison for ingroup & outgroup")

#Significance test for group comparison
ds.test<-ds %>%dplyr::select(groups, identity, value)


result <- data.frame(matrix(nrow = 8, ncol = 3))
colnames(result)<-c("groups", "f_val", "p_val")
for (i in unique(ds.test$groups)) {#f-val and p-val are ordered the same as in the original dataset
  ttest<-t.test(value~identity, data=ds.test[which(ds.test$groups==i),], var.equal = TRUE)
  result[i,1]<-i
  result[i,2]<-ttest$statistic
  result[i,3]<-ttest$p.value
}

ds.compare<-result %>%slice(9:16)
ds.compare$dir <- ifelse(ds.compare$f_val > 0, "positive", "negative")
#ds.sig_level <- factor(ds.compare$sig_level, levels = c("sig", "marginally_sig", "non-sig")) #define the order of legend bar

ds.compare$sig_level <- ifelse(ds.compare$p_val <= 0.05, "sig",
                       ifelse(ds.compare$p_val > 0.1, "non_sig", "marginally_sig"))
#ds.compare

p<-ggplot(ds.compare, aes(x=groups, y=p_val, fill= dir))+
    scale_fill_manual(values=c(negative="red", positive="green"))+
    geom_bar(stat="identity")+
    #ylab("Correlation with GG_Level")+
    #coord_flip()+
    xlab("")+
    ggtitle("Statistical significance check for group comparison")

label.df <- ds.compare %>% dplyr::filter(ds.compare$p_val < 0.05)
p + geom_text(data = label.df, label = "*")

#compare <-ddply(ds, .(groups), function())
ggplot(ds, aes(x = value, y = groups, fill = group)) +
  geom_density_ridges(scale = 1, alpha = 0.5)

For every eight group, participants usually rate 100 score for either as American or as outgroup and there is limited difference between the distribution of as Americna or as outgroup except for African American.

–3. Geographic plot

–prepare the raw data

only experiment group is selected
wide to long table
group the data first by groups(Muslims, Chinese, etc,.) then by identification (ingroup/outgroup)

dss <- gp %>%
  dplyr::select(CONDITION,LocationLatitude, LocationLongitude, Muslims_3:Italian_4) %>%
  dplyr::filter(CONDITION=="Experiment") %>%
  gather("group", value, Muslims_3:Italian_4)

dss$groups<-ifelse(dss$group=='Muslims_3'|dss$group=='Muslims_4', "Muslims", 
                  ifelse(dss$group=='Chinese_3'|dss$group=='Chinese_4', "Chinese",
                  ifelse(dss$group=='Hispanic_3'|dss$group=='Hispanic_4', "Hispanic",
                  ifelse(dss$group=='Russian_3'|dss$group=='Russian_4', "Russian",
                  ifelse(dss$group=='Irish_3'|dss$group=='Irish_4', "Irish",
                  ifelse(dss$group=='African_3'|dss$group=='African_4', "African",
                  ifelse(dss$group=='Mexican_3'|dss$group=='Mexican_4',"Mexican","Italian")))))))

dss$identity <-ifelse(grepl("^.+\\_(3)$", dss$group) == TRUE, "ingroup", "outgroup") #use regular expression to extract the last digit of group (eg:Muslims_4 -> 4)

dss<-dss %>% dplyr::select(LocationLatitude, LocationLongitude, groups, identity, value) %>%
  na.omit()


###non-immigrant data
non_imm<- gp %>%
  dplyr::select(Immigrant, CONDITION,LocationLatitude, LocationLongitude, Muslims_3:Italian_4) %>%
  dplyr::filter(CONDITION=="Experiment") %>%
  gather("group", value, Muslims_3:Italian_4)

non_imm$groups<-ifelse(non_imm$group=='Muslims_3'|non_imm$group=='Muslims_4', "Muslims", 
                  ifelse(non_imm$group=='Chinese_3'|non_imm$group=='Chinese_4', "Chinese",
                  ifelse(non_imm$group=='Hispanic_3'|non_imm$group=='Hispanic_4', "Hispanic",
                  ifelse(non_imm$group=='Russian_3'|non_imm$group=='Russian_4', "Russian",
                  ifelse(non_imm$group=='Irish_3'|non_imm$group=='Irish_4', "Irish",
                  ifelse(non_imm$group=='African_3'|non_imm$group=='African_4', "African",
                  ifelse(non_imm$group=='Mexican_3'|non_imm$group=='Mexican_4',"Mexican","Italian")))))))

non_imm$identity <-ifelse(grepl("^.+\\_(3)$", non_imm$group) == TRUE, "ingroup", "outgroup") #use regular expression to extract the last digit of group (eg:Muslims_4 -> 4)

non_immi<-non_imm %>% dplyr::filter(Immigrant == 2) %>%
  dplyr::select(LocationLatitude, LocationLongitude, groups, identity, value) %>%
  na.omit()

gpp<-gp %>%
  dplyr::select(Immigrant, CONDITION,LocationLatitude, LocationLongitude, Muslims_3:Italian_4) %>%
  dplyr::filter(CONDITION=="Experiment")
imm_percent <-gpp %>% dplyr::select(Immigrant, LocationLatitude, LocationLongitude) %>%
  na.omit()%>%
  mutate(Immigrant=dplyr::recode(Immigrant, '2'="No", '1'="Yes"))

1.acquire US map information

2.convert lat+lon to state code

Map U.S. state-level data

states <- map_data("state")
#first, make a blank state map with state data using geom_polygon()
latlong2state <- function(pointsDF) {
    # Prepare SpatialPolygons object with one SpatialPolygon
    # per state (plus DC, minus HI & AK)
    states <- map('state', fill=TRUE, col="transparent", plot=FALSE)
    IDs <- sapply(strsplit(states$names, ":"), function(x) x[1])
    states_sp <- map2SpatialPolygons(states, IDs=IDs,
                     proj4string=CRS("+proj=longlat +datum=WGS84"))

    # Convert pointsDF to a SpatialPoints object 
    pointsSP <- SpatialPoints(pointsDF, 
                    proj4string=CRS("+proj=longlat +datum=WGS84"))

    # Use 'over' to get _indices_ of the Polygons object containing each point 
    indices <- over(pointsSP, states_sp)

    # Return the state names of the Polygons object containing each point
    stateNames <- sapply(states_sp@polygons, function(x) x@ID)
    stateNames[indices]
}
testPoints <- data.frame(x = as.numeric(dss$LocationLongitude), y=as.numeric(dss$LocationLatitude))

dss$state<-latlong2state(testPoints) 
sum(is.na(dss$state))

## [1] 512

non_immPoints <-data.frame(x = as.numeric(non_immi$LocationLongitude), y= as.numeric(non_immi$LocationLatitude))
non_immi$state<-latlong2state(non_immPoints) 
sum(is.na(non_immi$state))

## [1] 224

#write.csv(non_imm, file = "non_imm.csv")


imm_percentPoints <- data.frame(x= as.numeric(imm_percent$LocationLongitude), 
                                y = as.numeric(imm_percent$LocationLatitude))
imm_percent$state<-latlong2state(imm_percentPoints) 
sum(is.na(imm_percent$state))

## [1] 32

The above code works for most lat&lon location but contains 512 NAs due to rough mapping

statewinner <- dss %>%
  tibble::rowid_to_column()%>%
  dplyr::select(rowid, identity, value, state) %>% spread(identity, value) %>%
  select(-rowid) %>%
  group_by(state) %>%
  summarise_all(mean, na.rm = T) %>%
  mutate(identification = ifelse(ingroup>outgroup, "ingroup", "outgroup")) %>%
  select(state, identification) %>% dplyr::rename(region = state)
#head(statewinner)

non_immwinner<- non_immi %>%
  tibble::rowid_to_column()%>%
  dplyr::select(rowid, identity, value, state) %>% spread(identity, value) %>%
  select(-rowid) %>%
  group_by(state) %>%
  summarise_all(mean, na.rm = T) %>%
  mutate(identification = ifelse(ingroup>outgroup, "ingroup", "outgroup")) %>%
  select(state, identification) %>% dplyr::rename(region = state)


##calculate immigration percetage
imm.perce<-imm_percent %>%
  dplyr::filter(!is.na(state)) %>%
  dplyr::count(Immigrant, state) %>%
  dplyr::group_by(state) %>%
  dplyr::transmute(Immigrant, imm_Percentage=n/sum(n)*100)%>%
  dplyr::filter(Immigrant=="Yes") %>%
  dplyr::select(state, imm_Percentage) %>% dplyr::rename(region = state)

id_winner<-left_join(states, statewinner, by = "region")
nonimm_winner<-left_join(states, non_immwinner, by = "region")
per_imm<-left_join(states, imm.perce, by = "region")

p <- ggplot(data = id_winner,
            aes(x = long, y = lat,
                group = group, fill = identification))

p + geom_polygon(color = "gray90", size = 0.1) +
    coord_map(projection = "albers", lat0 = 39, lat1 = 45)+
  ggtitle("statewinner")

###non-immigrant
ni <- ggplot(data = nonimm_winner,
            aes(x = long, y = lat,
                group = group, fill = identification))

ni + geom_polygon(color = "gray90", size = 0.1) +
    coord_map(projection = "albers", lat0 = 39, lat1 = 45)+
  ggtitle("statewinner of non-immigrant")

###percentage of immigrants
perimm<-ggplot(data = per_imm, 
               aes(x = long, y=lat,
                   group = group, fill = imm_Percentage))+
  scale_fill_gradient(low="white", high="red")
  
perimm + geom_polygon(color = "gray90", size = 0.1) +
    coord_map(projection = "albers", lat0 = 39, lat1 = 45)+
  ggtitle("Immigration percentage")

ggplot() + geom_polygon(data = states, aes(x=long, y = lat, group = group), fill = "aliceblue", color = "black")+ coord_fixed(1.3)+
  geom_point(data = dss, aes(x= LocationLongitude, y=LocationLatitude, shape=groups, color = value))

#plot intergroup people more as ingroup
ds3<-gp %>%
  dplyr::select(CONDITION,LocationLatitude, LocationLongitude, Muslims_3, Chinese_3, Hispanic_3, Russian_3, Irish_3, African_3, Mexican_3, Italian_3) %>%
  dplyr::filter(CONDITION=="Experiment") %>%
  gather("group", value, Muslims_3:Italian_3)
GG3<-ggplot() + geom_polygon(data = states, aes(x=long, y = lat, group = group), fill = "aliceblue", color = "black")+ coord_fixed(1.3)+
  geom_point(data = ds3, aes(x= LocationLongitude, y=LocationLatitude, shape=group, color = value))+
  ggtitle("Feel intergroup people are American")

#plot intergroup people more as outgroup
ds4<-gp %>%
  dplyr::select(CONDITION,LocationLatitude, LocationLongitude, Muslims_4, Chinese_4, Hispanic_4, Russian_4, Irish_4, African_4, Mexican_4, Italian_4) %>%
  dplyr::filter(CONDITION=="Experiment") %>%
  gather("group", value, Muslims_4:Italian_4)
GG4<-ggplot() + geom_polygon(data = states, aes(x=long, y = lat, group = group), fill = "aliceblue", color = "black")+ coord_fixed(1.3)+
  geom_point(data = ds4, aes(x= LocationLongitude, y=LocationLatitude, shape=group, color = value))+
  ggtitle("Feel intergroup people are outgroup")
GG3;GG4

Week3_gp_comparison_revised

Weijia Bao

8/4/2019

Part I: Control vs Experimental Groups

prepare data to draw comparison barplot

Identification Comparison

Group Comparison