Data

library(foreign)
library(foreign)
library(car)
library(MASS)
library(tidyverse)
library(psych)
library(rstatix)
library(reshape2)
library(readr)
library(gridExtra)
library(dplyr)
library(effsize)

# Load the SPSS survey file into a data frame (to.data.frame = TRUE).
# NOTE(review): absolute, machine-specific path - the script only runs where
# the file exists at exactly this location.
data <- read.spss("/Users/admin/Downloads/ESS8e02_1.sav", to.data.frame=TRUE)

Task

Chi-square test of independence

Combine groups in the data

library(dplyr)

# Keep only the two variables used in the chi-square analysis.
data <- data %>%
  select(ccgdbd, gndr)

# Collapse the answer scale into three bands. ccgdbd is matched by label:
# the end points are labelled "Extremely bad" / "Extremely good" and the
# points in between "1" to "9". `%in%` replaces the chains of `==`
# comparisons and never yields NA, so missing answers fall through to the
# "unknown" catch-all below.
# NOTE: the label "god" (sic) is kept unchanged because downstream output
# is keyed on it.
data$ccgdbd1 <- case_when(
  data$ccgdbd %in% c("Extremely bad", "1", "2", "3") ~ "neg",
  data$ccgdbd %in% c("4", "5", "6") ~ "mid",
  data$ccgdbd %in% c("7", "8", "9", "Extremely good") ~ "god",
  TRUE ~ "unknown"
)

# Drop respondents whose answer did not fall into any band.
data <- data %>%
  filter(ccgdbd1 != "unknown")

Q ccgdbd: How good or bad do you think the impact of climate change will be on people across the world? Please choose a number from 0 to 10, where 0 is extremely bad and 10 is extremely good.

Research hypothesis: Females and males have different opinions toward the impact of climate change.

Check assumptions

# case_when() produced plain strings; confirm that, then store the banded
# answer as a factor so it can be cross-tabulated with gender.
class(data[["ccgdbd1"]])
## [1] "character"
data$ccgdbd1 <- as.factor(data[["ccgdbd1"]])
class(data[["ccgdbd1"]])
## [1] "factor"
class(data[["gndr"]])
## [1] "factor"
# Build the attitude-by-gender contingency table once, then print it.
data_tbl <- table(data$ccgdbd1, data$gndr)
data_tbl
##      
##        Male Female
##   god  1699   1770
##   mid  6531   7434
##   neg 11334  12459

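Assumptions are met: both variables are categorical, the observations are independent (one row per respondent), and every cell of the contingency table is far above the minimum expected count of 5.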

H0: There is no difference in attitudes towards the impact of climate change between males and females. HA: There is a difference in attitudes towards the impact of climate change between males and females.

Visualization

library(sjPlot)
# Stacked bar chart (bar.pos = "stack") of attitude band by gender, with
# percentages computed per row (margin = "row").
# NOTE(review): show.summary = TRUE presumably annotates the plot with the
# chi-square summary - confirm against the sjPlot documentation.
plot_xtab(data$ccgdbd1, data$gndr, margin = "row", bar.pos = "stack",
         show.summary = TRUE)

Based on the graph, there seems to be no association between gender and attitude towards the impact of climate change. Let’s run the test.

Chi-square

# Pearson chi-square test of attitude band against gender. The test is
# fitted once, stored for the residual analysis, and then printed.
chi <- chisq.test(data$ccgdbd1, data$gndr)
chi
## 
##  Pearson's Chi-squared test
## 
## data:  data$ccgdbd1 and data$gndr
## X-squared = 6.1849, df = 2, p-value = 0.04539

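P-value < 0.05 → we reject H0 in favour of HA: there is a statistically significant difference in attitudes towards the impact of climate change between males and females. Note that the p-value (0.045) is only just below the threshold and the sample is very large (about 41,000 respondents), so the difference, although statistically significant, is likely to be small.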

# Standardized residuals of the chi-square test: store them, then print.
stdres <- chi$stdres
stdres
##             data$gndr
## data$ccgdbd1       Male     Female
##          god  1.8762290 -1.8762290
##          mid -2.0005472  2.0005472
##          neg  0.8622589 -0.8622589

library(corrplot)
# is.corr = FALSE because the residual matrix is not a correlation matrix;
# method = "num" prints the values as numbers in the cells.
corrplot(stdres, method = "num", is.corr = FALSE)

The standardized residuals show where the observed counts depart from those expected under independence. Somewhat fewer females than expected think that the impact of climate change will be good, and somewhat more males than expected think so (±1.88). At the same time, more females and fewer males than expected hold a moderate opinion (±2.00). Only the moderate category exceeds the usual |1.96| threshold, and only marginally, so these deviations are small rather than large.

T-Test

Data cleaning

# Read the raw file again and keep the two variables needed for the t-test.
data1 <- read.spss(
  "/Users/admin/Downloads/ESS8e02_1.sav",
  to.data.frame = TRUE
) %>%
  select(ccgdbd, gndr)

class(data1$ccgdbd)
## [1] "factor"
# Turn the labelled factor into a numeric 0-10 score: the two labelled end
# points are mapped to "0" and "10", every other label is left as it is and
# then parsed as a number.
data1$ccgdbd <- data1$ccgdbd %>%
  as.character() %>%
  dplyr::recode("Extremely bad" = "0", "Extremely good" = "10") %>%
  as.numeric()

# Cross-tabulate the numeric score by gender to check the recoded values.
table(data1)
##       gndr
## ccgdbd Male Female
##     0  2632   2824
##     1  1646   1960
##     2  3381   3466
##     3  3675   4209
##     4  2419   2765
##     5  3112   3486
##     6  1000   1183
##     7   844    966
##     8   539    555
##     9   146    114
##     10  170    135
# Keep rows with a score of at most 10. The comparison is NA for missing
# scores, and filter() drops those rows too.
data1 <- filter(data1, ccgdbd <= 10)

# Re-check the score-by-gender counts after filtering.
table(data1)
##       gndr
## ccgdbd Male Female
##     0  2632   2824
##     1  1646   1960
##     2  3381   3466
##     3  3675   4209
##     4  2419   2765
##     5  3112   3486
##     6  1000   1183
##     7   844    966
##     8   539    555
##     9   146    114
##     10  170    135
# Show the structure (row count, column types, attributes) of the cleaned data.
str(data1)
## 'data.frame':    41232 obs. of  2 variables:
##  $ ccgdbd: num  10 2 2 7 5 5 5 3 4 5 ...
##  $ gndr  : Factor w/ 2 levels "Male","Female": 2 1 2 1 2 2 2 2 2 2 ...
##  - attr(*, "variable.labels")= Named chr [1:534] "Title of dataset" "ESS round" "Edition" "Production date" ...
##   ..- attr(*, "names")= chr [1:534] "name" "essround" "edition" "proddate" ...
##  - attr(*, "codepage")= int 65001

Assumptions

  1. Normality of two samples
library(ggplot2)
library(dbplyr)
library(gridExtra)

# Create the per-gender subsets as explicit statements instead of assigning
# them inside the ggplot() call arguments.
data_f <- data1 %>%
  filter(gndr == "Female")
data_m <- data1 %>%
  filter(gndr == "Male")

# Histogram of the score for each group, used to judge normality visually.
P1 <- ggplot(data_f, aes(x = ccgdbd)) +
  geom_histogram(binwidth = 0.5)

P2 <- ggplot(data_m, aes(x = ccgdbd)) +
  geom_histogram(binwidth = 0.5)

# Show both histograms side by side (females left, males right).
grid.arrange(P1, P2, ncol = 2)

Based on the graph, the data are clearly not normal: there are two peaks and the distribution is right-skewed. However, the sample is very large (which is also why shapiro.test() cannot be used: it accepts at most 5,000 observations), so by the central limit theorem the sampling distribution of the group means is approximately normal and the t-test can still be applied.

  2. Variances
# Boxplot of the score by gender. Rows with a missing gender are removed
# with is.na() rather than by comparing against the string "NA", which only
# worked because filter() drops rows where the comparison itself is NA.
ggplot(
  data1 %>%
    filter(!is.na(gndr)),
  aes(x = gndr, y = ccgdbd)
) +
  geom_boxplot()

# Levene's test for homogeneity of variance between the two genders.
leveneTest(ccgdbd ~ gndr, data = data1)
## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value  Pr(>F)  
## group     1   4.524 0.03343 *
##       41225                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

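Levene's test is significant (p = 0.033 < 0.05), so the variances of the two groups are not equal. This is not a problem here: t.test() runs Welch's t-test by default, which does not assume equal variances. Given the large sample (CLT), the assumptions for the Welch t-test are met.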

# Two-sample t-test of the score by gender. With the default arguments this
# is Welch's test (see the printed header), which does not assume equal
# variances.
t.test(data1$ccgdbd ~ data1$gndr)
## 
##  Welch Two Sample t-test
## 
## data:  data1$ccgdbd by data1$gndr
## t = 0.15752, df = 40610, p-value = 0.8748
## alternative hypothesis: true difference in means between group Male and group Female is not equal to 0
## 95 percent confidence interval:
##  -0.03913373  0.04597326
## sample estimates:
##   mean in group Male mean in group Female 
##             3.266357             3.262937

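The p-value is very large (0.87): we cannot reject H0 that the mean scores of males and females are equal. The two group means are almost identical (3.27 vs 3.26).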

Let’s check with a non-parametric test.

# Wilcoxon rank-sum test of the score by gender, as a non-parametric
# counterpart to the t-test.
# NOTE(review): `data = data1` is redundant here because the formula already
# refers to the columns as data1$ccgdbd and data1$gndr.
wilcox.test(data1$ccgdbd ~ data1$gndr, data = data1)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  data1$ccgdbd by data1$gndr
## W = 211045844, p-value = 0.4705
## alternative hypothesis: true location shift is not equal to 0

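The p-value is also large (0.47), so we again cannot reject H0. This does not contradict the chi-square result: the t-test and the Wilcoxon test compare the average level (location) of the answers, whereas the chi-square test compares the whole distribution across the three categories. Males are slightly over-represented in both the negative and the positive categories and under-represented in the middle one, which changes the shape of the distribution but leaves the mean practically unchanged.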

library(cowsay)

# Print the closing message as ASCII art.
# NOTE(review): by = "random" presumably picks the animal at random, so the
# figure may differ between runs - confirm in the cowsay documentation.
say("Data analysis is often confusing", by = "random")
## 
##  ----- 
## Data analysis is often confusing 
##  ------ 
##     \   ^__^ 
##      \  (oo)\ ________ 
##         (__)\         )\ /\ 
##              ||------w|
##              ||      ||