# Load the comma-separated user data set; the first row holds column names.
User_Data <- read.table(
  file = "./User_Data.csv",
  sep = ",",
  header = TRUE
)
# Preview the first six rows.
head(User_Data, n = 6L)
## User.ID Gender Age EstimatedSalary Purchased
## 1 15624510 Male 19 19000 0
## 2 15810944 Male 35 20000 0
## 3 15668575 Female 26 43000 0
## 4 15603246 Female 27 57000 0
## 5 15804002 Male 19 76000 0
## 6 15728773 Male 27 58000 0
# Encode Gender as a factor with "Male" as the first level and "Female" as
# the second; the labels default to the level names.
User_Data[["Gender"]] <- factor(
  User_Data[["Gender"]],
  levels = c("Male", "Female")
)
# Inspect the column types after the conversion.
str(User_Data)
## 'data.frame': 400 obs. of 5 variables:
## $ User.ID : int 15624510 15810944 15668575 15603246 15804002 15728773 15598044 15694829 15600575 15727311 ...
## $ Gender : Factor w/ 2 levels "Male","Female": 1 1 2 2 1 1 2 2 1 2 ...
## $ Age : int 19 35 26 27 19 27 27 32 25 35 ...
## $ EstimatedSalary: int 19000 20000 43000 57000 76000 58000 84000 150000 33000 65000 ...
## $ Purchased : int 0 0 0 0 0 0 0 1 0 0 ...
# Summarise every column except the first (User.ID) and fifth (Purchased).
summary(User_Data[, c("Gender", "Age", "EstimatedSalary")])
## Gender Age EstimatedSalary
## Male :196 Min. :18.00 Min. : 15000
## Female:204 1st Qu.:29.75 1st Qu.: 43000
## Median :37.00 Median : 70000
## Mean :37.66 Mean : 69742
## 3rd Qu.:46.00 3rd Qu.: 88000
## Max. :60.00 Max. :150000
#Descriptive Statistics
library(psych)
# Descriptive statistics of estimated salary, reported separately per gender.
describeBy(
  User_Data$EstimatedSalary,
  group = User_Data$Gender
)
##
## Descriptive statistics by group
## group: Male
## vars n mean sd median trimmed mad min max range skew
## X1 1 196 67642.86 32421.82 68000 65468.35 28910.7 15000 150000 135000 0.49
## kurtosis se
## X1 -0.14 2315.84
## ------------------------------------------------------------
## group: Female
## vars n mean sd median trimmed mad min max range skew
## X1 1 204 71759.8 35595.24 70500 69536.59 37806.3 15000 150000 135000 0.46
## kurtosis se
## X1 -0.7 2492.17
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(ggpubr)
# Histogram of estimated salary for the rows where Gender is "Male"
# (bins are 5000 wide).
Salary_Male <- ggplot(
  data = User_Data[User_Data$Gender == "Male", ],
  mapping = aes(x = EstimatedSalary)
) +
  geom_histogram(binwidth = 5000, colour = "black", fill = "blue") +
  labs(
    x = "Estimated Salary",
    y = "Frequency",
    title = "Salary Distribution - Male Employees"
  )

# Same histogram for the rows where Gender is "Female".
Salary_Female <- ggplot(
  data = User_Data[User_Data$Gender == "Female", ],
  mapping = aes(x = EstimatedSalary)
) +
  geom_histogram(binwidth = 5000, colour = "black", fill = "red") +
  labs(
    x = "Estimated Salary",
    y = "Frequency",
    title = "Salary Distribution - Female Employees"
  )

# Show both histograms side by side in a single row.
ggarrange(Salary_Male, Salary_Female, nrow = 1, ncol = 2)
library(ggpubr)
# Q-Q plot of estimated salary, one panel per gender.
ggqqplot(
  User_Data,
  x = "EstimatedSalary",
  facet.by = "Gender"
)
#Variance Test
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Levene's test for equal variance of estimated salary across genders.
leveneTest(EstimatedSalary ~ Gender, data = User_Data)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 2.7631 0.09725 .
## 398
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Normality Test
library (dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library (rstatix)
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
# Shapiro-Wilk normality test of estimated salary within each gender group.
User_Data %>%
  dplyr::group_by(Gender) %>%
  rstatix::shapiro_test(EstimatedSalary)
## # A tibble: 2 × 4
## Gender variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 Male EstimatedSalary 0.962 0.0000349
## 2 Female EstimatedSalary 0.954 0.00000416
#Parametric Test
# Two-sided two-sample t-test of estimated salary by gender, with the
# variances treated as equal (var.equal = TRUE).
t.test(
  User_Data$EstimatedSalary ~ User_Data$Gender,
  alternative = "two.sided",
  var.equal = TRUE
)
##
## Two Sample t-test
##
## data: User_Data$EstimatedSalary by User_Data$Gender
## t = -1.2079, df = 398, p-value = 0.2278
## alternative hypothesis: true difference in means between group Male and group Female is not equal to 0
## 95 percent confidence interval:
## -10817.701 2583.807
## sample estimates:
## mean in group Male mean in group Female
## 67642.86 71759.80
#Non-Parametric Test
# Two-sided Wilcoxon rank-sum test of estimated salary by gender, without an
# exact p-value (exact = FALSE) and without continuity correction
# (correct = FALSE).
wilcox.test(
  User_Data$EstimatedSalary ~ User_Data$Gender,
  alternative = "two.sided",
  exact = FALSE,
  correct = FALSE
)
##
## Wilcoxon rank sum test
##
## data: User_Data$EstimatedSalary by User_Data$Gender
## W = 18983, p-value = 0.3827
## alternative hypothesis: true location shift is not equal to 0
#Effect size
library(effectsize)
##
## Attaching package: 'effectsize'
## The following objects are masked from 'package:rstatix':
##
## cohens_d, eta_squared
## The following object is masked from 'package:psych':
##
## phi
# Rank-biserial correlation for the Wilcoxon rank-sum test of estimated
# salary by gender. The result is stored so that the interpretation below
# uses the computed estimate rather than a rounded value typed in by hand.
rank_biserial_es <- effectsize(
  wilcox.test(User_Data$EstimatedSalary ~ User_Data$Gender,
              correct = FALSE,
              exact = FALSE,
              alternative = "two.sided")
)
rank_biserial_es
## r (rank biserial) | 95% CI
## ---------------------------------
## -0.05 | [-0.16, 0.06]
# Label the computed effect size. Passing the unrounded estimate matters
# because the printed value (0.05) is rounded and coincides with a cut-off
# between two interpretation labels.
interpret_rank_biserial(rank_biserial_es$r_rank_biserial)
## [1] "very small"
## (Rules: funder2019)