# Show the data sets available in the packages on the search path.
data()
# Show the data sets of every installed package, attached or not.
data(package = .packages(all.available = TRUE))

Research question: Is there a difference in wages between males and females?

#install.packages("carData")
library(carData)
# Working copy of the SLID data set that the rest of the script modifies.
mydata <- SLID

# Preview the first six rows of the data.
head(mydata, n = 6L)
##   wages education age    sex language
## 1 10.56      15.0  40   Male  English
## 2 11.00      13.2  19   Male  English
## 3    NA      16.0  49   Male    Other
## 4 17.76      14.0  46   Male    Other
## 5    NA       8.0  71   Male  English
## 6 14.00      16.0  50 Female  English

##Description:

Data manipulation and descriptive statistics

# Fix the RNG seed so the random subsample below is reproducible.
set.seed(1)
# Draw 100 rows at random (without replacement) from the full data set.
mydata <- mydata[sample(nrow(mydata), 100), ]
library(tidyr)
# Drop every row that has a missing value in any column. This happens after
# sampling, so fewer than 100 rows can remain.
mydata <- drop_na(mydata)

# Factor version of sex with "Male" as the first (reference) level.
mydata$sexF <- factor(mydata$sex,
                      levels = c("Male", "Female"),
                      labels = c("Male", "Female"))

# Factor version of language; any value other than the two listed levels
# becomes NA.
mydata$languageF <- factor(mydata$language,
                           levels = c("English", "Other"),
                           labels = c("English", "Other"))
library(psych)

# Descriptive statistics of wages, separately for each sex.
describeBy(x = mydata$wages, group = mydata$sexF)
## 
##  Descriptive statistics by group 
## group: Male
##    vars  n  mean  sd median trimmed  mad  min max range skew kurtosis   se
## X1    1 29 18.15 8.5  18.29   17.46 8.47 6.35  48 41.65 1.22     2.87 1.58
## ------------------------------------------------------------ 
## group: Female
##    vars  n  mean    sd median trimmed  mad  min   max range skew kurtosis   se
## X1    1 18 16.61 10.66  12.97    15.8 7.37 6.08 40.05 33.97 0.88    -0.74 2.51

Interpretation

##Male

##Female

Statistical hypothesis

###Assumptions that need to be met:

    1. Variable is numeric
    1. The distribution of the variable is normal in both populations
    1. The data must come from two independent populations
    1. Variable has the same variance in both populations – since this assumption is often violated, we apply Welch correction.
  • The first assumption is fulfilled: wage is expressed in Canadian dollars, so it is a numerical variable.

  • The second assumption should be checked with the Shapiro-Wilk test, which we will do now.

Shapiro-Wilk test

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rstatix)
## Warning: package 'rstatix' was built under R version 4.3.2
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
# Shapiro-Wilk normality test of wages, run separately within each sex.
shapiro_test(group_by(mydata, sexF), wages)
## # A tibble: 2 × 4
##   sexF   variable statistic       p
##   <fct>  <chr>        <dbl>   <dbl>
## 1 Male   wages        0.887 0.00478
## 2 Female wages        0.834 0.00475
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
# Histogram of wages (bins 5 units wide), one panel per sex stacked
# vertically so the two distributions share the same x axis.
ggplot(mydata, aes(x = wages)) +
  geom_histogram(binwidth = 5, colour = "grey", fill = "lightblue") +
  facet_wrap(~sexF, ncol = 1) +
  ylab("Frequency")

#' Drop observations whose wage exceeds a threshold
#'
#' @param df A data frame with a numeric `wages` column.
#' @param wage_threshold Largest wage to keep (inclusive); defaults to 43.
#' @return `df` restricted to the rows whose `wages` is at most
#'   `wage_threshold`. Rows with a missing wage are kept unchanged, and the
#'   result is always a data frame.
remove_high_wages <- function(df, wage_threshold = 43) {
  stopifnot(is.data.frame(df), "wages" %in% names(df))
  # A missing wage is not known to be high, so keep the row intact instead of
  # letting an NA index turn it into an all-NA row.
  keep <- is.na(df$wages) | df$wages <= wage_threshold
  # drop = FALSE stops a one-column data frame from collapsing to a vector.
  df[keep, , drop = FALSE]
}
# Welch two-sample t-test of mean wages between the sexes (independent
# groups, unequal variances). `paired` is left out on purpose: unpaired is the
# default, and R >= 4.4.0 raises an error when `paired` is supplied together
# with a formula.
t.test(mydata$wages ~ mydata$sexF,
       var.equal = FALSE,
       alternative = "two.sided")
## 
##  Welch Two Sample t-test
## 
## data:  mydata$wages by mydata$sexF
## t = 0.52075, df = 30.212, p-value = 0.6063
## alternative hypothesis: true difference in means between group Male and group Female is not equal to 0
## 95 percent confidence interval:
##  -4.512310  7.602271
## sample estimates:
##   mean in group Male mean in group Female 
##             18.15276             16.60778
library(effectsize)
## Warning: package 'effectsize' was built under R version 4.3.2
## 
## Attaching package: 'effectsize'
## The following objects are masked from 'package:rstatix':
## 
##     cohens_d, eta_squared
## The following object is masked from 'package:psych':
## 
##     phi
# Cohen's d for the wage difference between the sexes, using the un-pooled SD
# to match the Welch test. `pooled_sd` must be passed with `=`; written as
# `pooled_sd - FALSE` it is not a named argument and the pooled default is
# used. NOTE: the recorded output below was generated with the pooled SD and
# should be regenerated.
effectsize::cohens_d(mydata$wages ~ mydata$sexF,
                     pooled_sd = FALSE)
## Cohen's d |        95% CI
## -------------------------
## 0.16      | [-0.43, 0.75]
## 
## - Estimated using pooled SD.
# Verbal label for d = 0.16 (value copied by hand from the output above)
# under the Sawilowsky (2009) rules.
interpret_cohens_d(0.16, rules = "sawilowsky2009")
## [1] "very small"
## (Rules: sawilowsky2009)

Wilcoxon Rank Sum Test

library(psych)
# Descriptive statistics for wages, split by sex.
describeBy(x = mydata$wages, group = mydata$sexF)
## 
##  Descriptive statistics by group 
## group: Male
##    vars  n  mean  sd median trimmed  mad  min max range skew kurtosis   se
## X1    1 29 18.15 8.5  18.29   17.46 8.47 6.35  48 41.65 1.22     2.87 1.58
## ------------------------------------------------------------ 
## group: Female
##    vars  n  mean    sd median trimmed  mad  min   max range skew kurtosis   se
## X1    1 18 16.61 10.66  12.97    15.8 7.37 6.08 40.05 33.97 0.88    -0.74 2.51
# Wilcoxon rank sum test of wages between the sexes, using the normal
# approximation (exact = FALSE) without continuity correction.
# `paired` is left out on purpose: unpaired is the default, and R >= 4.4.0
# raises an error when `paired` is supplied together with a formula.
wilcox.test(mydata$wages ~ mydata$sexF,
            correct = FALSE,
            exact = FALSE,
            alternative = "two.sided")
## 
##  Wilcoxon rank sum test
## 
## data:  mydata$wages by mydata$sexF
## W = 305, p-value = 0.3355
## alternative hypothesis: true location shift is not equal to 0
library(effectsize)
# Effect size for the Wilcoxon rank sum test, computed from the test object
# (the recorded output reports it as a rank-biserial r).
# `paired` is left out on purpose: unpaired is the default, and R >= 4.4.0
# raises an error when `paired` is supplied together with a formula.
effectsize(wilcox.test(mydata$wages ~ mydata$sexF,
                       correct = FALSE,
                       exact = FALSE,
                       alternative = "two.sided"))
## r (rank biserial) |        95% CI
## ---------------------------------
## 0.17              | [-0.17, 0.47]
# Verbal label for r = 0.17 (value copied by hand from the output above),
# with the default rule set spelled out.
interpret_rank_biserial(0.17, rules = "funder2019")
## [1] "small"
## (Rules: funder2019)