library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readxl)
library(coin)
## Loading required package: survival
library(DT)
# Load the Excel workbook and reshape it to long format: one row per
# observation, holding the weekly wage (`wwage`) and the gender label of the
# column it came from.
# NOTE(review): the absolute path only resolves on the original author's
# machine; point it at a project-relative location before sharing the script.
data <- read_excel("/Users/pavelponomarev/Downloads/datadownload.xlsx")
data1 <- data.frame(wwage = data$`Weekly male`, gender = "Male")
data2 <- data.frame(wwage = data$`Weekly female`, gender = "Female")
data_test <- rbind(data1, data2)
# Convert the stacked wages to numeric; entries that cannot be parsed as
# numbers become NA, which is what triggers the coercion warning recorded
# right after this statement.
data_test$wwage <- data_test$wwage %>% as.numeric()
## Warning in data_test$wwage %>% as.numeric(): в результате преобразования созданы
## NA
# Store gender as a factor so the tests below treat it as a grouping variable.
data_test$gender <- as.factor(data_test$gender)
В итоге получается такая таблица; далее будем проверять, зависит ли распределение wwage от переменной gender.
# Render the long-format wage table as an interactive DT widget.
DT::datatable(data_test)
# Pearson chi-squared test of independence between weekly wage and gender.
# chisq.test() coerces both vectors to factors, so passing the raw numeric
# wages turns every distinct value into its own category (hence df = 666 and
# the "approximation may be incorrect" warning in the recorded output).
# The wages are therefore grouped into quartile bands before testing.
# NOTE: the output recorded below was produced by the earlier unbinned call
# chisq.test(data_test$wwage, data_test$gender) and has to be regenerated.
wage_breaks <- unique(
  quantile(data_test$wwage, probs = seq(0, 1, by = 0.25), na.rm = TRUE)
)
# include.lowest keeps the minimum wage inside the first band; rows with a
# missing wage get an NA band and are dropped by table().
wage_band <- cut(data_test$wwage, breaks = wage_breaks, include.lowest = TRUE)
chisq.test(table(wage_band, data_test$gender))
## Warning in chisq.test(data_test$wwage, data_test$gender): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: data_test$wwage and data_test$gender
## X-squared = 699.05, df = 666, p-value = 0.1816
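Если применять хи-квадрат напрямую к исходным (несгруппированным) значениям wwage, получается очень большое p-value (0.1816): chisq.test превращает каждое уникальное значение зарплаты в отдельную категорию (df = 666), поэтому такой результат неинформативен.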
# General independence test from the coin package: does the distribution of
# wwage differ between the two gender groups?
coin::independence_test(
  wwage ~ gender,
  data = data_test
)
##
## Asymptotic General Independence Test
##
## data: wwage by gender (Female, Male)
## Z = -6.1371, p-value = 8.403e-10
## alternative hypothesis: two.sided
# Two-sample t-test comparing mean weekly wage between genders. The defaults
# are spelled out: two-sided alternative, unequal variances (Welch) and a
# 95% confidence interval.
t.test(
  wwage ~ gender,
  data = data_test,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: wwage by gender
## t = -6.3441, df = 751.36, p-value = 3.864e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -133.42858 -70.36598
## sample estimates:
## mean in group Female mean in group Male
## 487.6926 589.5899