# load data
library(tidyverse)
library(psych)
library(infer)
library(openintro)
library(ggplot2)
# read the csv file
salary <- read_csv("https://raw.githubusercontent.com/AnnaMoy/portfolio/refs/heads/main/Employer's%20Salary%20Job%20Title%20and%20Country%20Information/Salary_Data.csv")
salary
## # A tibble: 6,704 × 6
## Age Gender `Education Level` `Job Title` `Years of Experience` Salary
## <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 32 Male Bachelor's Software Engineer 5 90000
## 2 28 Female Master's Data Analyst 3 65000
## 3 45 Male PhD Senior Manager 15 150000
## 4 36 Female Bachelor's Sales Associate 7 60000
## 5 52 Male Master's Director 20 200000
## 6 29 Male Bachelor's Marketing Analyst 2 55000
## 7 42 Female Master's Product Manager 12 120000
## 8 31 Male Bachelor's Sales Manager 4 80000
## 9 26 Female Bachelor's Marketing Coordi… 1 45000
## 10 38 Male PhD Senior Scientist 10 110000
## # ℹ 6,694 more rows
dim(salary)
## [1] 6704 6
# Tidying up the data
# renaming the column names
colnames(salary) <- c("age", "gender", "education", "job_title", "yrs_experience", "salary")
# remove missing data
salary <- salary %>%
drop_na()
#add grouping years of experience
salary <- salary %>%
mutate(yrs = case_when(yrs_experience <= 5 ~"0-5",
yrs_experience > 5 & yrs_experience <= 10 ~ "6-10",
yrs_experience > 10 & yrs_experience <= 15 ~ "11-15",
yrs_experience > 15 & yrs_experience <= 20 ~ "16-20",
yrs_experience > 20 ~ "20+"))
# order for the charts
salary <- salary %>%
mutate(yrs = fct_relevel(yrs,"0-5","6-10","11-15","16-20","20+"))
# grouping for education into buckets
salary$degree<-ifelse(grepl("Bachelor's*", salary$education, ignore.case= TRUE), "Bachelor",
ifelse(grepl("Master's*", salary$education, ignore.case= TRUE), "Master",
ifelse(grepl("phd", salary$education, ignore.case= TRUE), "PhD",
ifelse(grepl("*High School", salary$education, ignore.case= TRUE),"High School","other"))))
#order for the charts
salary <- salary %>%
mutate(degree = fct_relevel(degree, "High School", "Bachelor", "Master", "PhD"))
salary <- salary %>%
mutate(range = case_when(age <= 29 ~"20-29",
age > 29 & yrs_experience <= 39 ~ "30-39",
age > 39 & yrs_experience <= 49 ~ "40-49",
age > 49 & yrs_experience <= 59 ~ "50-59",
age >59 ~ "60+"))
# order for the charts
salary <- salary %>%
mutate(range = fct_relevel(yrs,"20-29","30-39","40-49","50-59","60+"))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `range = fct_relevel(yrs, "20-29", "30-39", "40-49", "50-59",
## "60+")`.
## Caused by warning:
## ! 5 unknown levels in `f`: 20-29, 30-39, 40-49, 50-59, and 60+
salary
## # A tibble: 6,698 × 9
## age gender education job_title yrs_experience salary yrs degree range
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <fct> <fct> <fct>
## 1 32 Male Bachelor's Software En… 5 90000 0-5 Bache… 0-5
## 2 28 Female Master's Data Analyst 3 65000 0-5 Master 0-5
## 3 45 Male PhD Senior Mana… 15 150000 11-15 PhD 11-15
## 4 36 Female Bachelor's Sales Assoc… 7 60000 6-10 Bache… 6-10
## 5 52 Male Master's Director 20 200000 16-20 Master 16-20
## 6 29 Male Bachelor's Marketing A… 2 55000 0-5 Bache… 0-5
## 7 42 Female Master's Product Man… 12 120000 11-15 Master 11-15
## 8 31 Male Bachelor's Sales Manag… 4 80000 0-5 Bache… 0-5
## 9 26 Female Bachelor's Marketing C… 1 45000 0-5 Bache… 0-5
## 10 38 Male PhD Senior Scie… 10 110000 6-10 PhD 6-10
## # ℹ 6,688 more rows
You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
Is there a relationship between salary and gender, years of experience, age and education?
What are the cases, and how many are there?
Each case represents a worker’s demographic data and their employment information . There are 6704 observations in the dataset.
Describe the method of data collection.
The data was collected through different industries and regions across the globe. It was sourced through employment websites and surveys. The data contains information about age, gender, education, job titles, years of experience, salary, race and geographic location.
What type of study is this (observational/experiment)?
This information was received through observational.
If you collected the data, state self-collected. If not, provide a citation/link.
Data was collected by Kaggle: Salary by Job Title and Country.
What is the response variable? Is it quantitative or qualitative?
The response variable is salary which is quantitative and numeric.
The explanatory variable are :
Gender (qualitative and categorical)
Education (qualitative and categorical)
Years of experience (quantitative and numeric)
Age (quantitative and numeric)
Provide summary statistics for each the variables. Also include appropriate visualizations related to your research question (e.g. scatter plot, boxplots, etc). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
##filter for only female and male (ignoring Others)
gender <- salary %>%
filter(gender == "Female" | gender == "Male")
gender
## # A tibble: 6,684 × 9
## age gender education job_title yrs_experience salary yrs degree range
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <fct> <fct> <fct>
## 1 32 Male Bachelor's Software En… 5 90000 0-5 Bache… 0-5
## 2 28 Female Master's Data Analyst 3 65000 0-5 Master 0-5
## 3 45 Male PhD Senior Mana… 15 150000 11-15 PhD 11-15
## 4 36 Female Bachelor's Sales Assoc… 7 60000 6-10 Bache… 6-10
## 5 52 Male Master's Director 20 200000 16-20 Master 16-20
## 6 29 Male Bachelor's Marketing A… 2 55000 0-5 Bache… 0-5
## 7 42 Female Master's Product Man… 12 120000 11-15 Master 11-15
## 8 31 Male Bachelor's Sales Manag… 4 80000 0-5 Bache… 0-5
## 9 26 Female Bachelor's Marketing C… 1 45000 0-5 Bache… 0-5
## 10 38 Male PhD Senior Scie… 10 110000 6-10 PhD 6-10
## # ℹ 6,674 more rows
Salary for Female and Male
#observe the salary difference in mean between male and female
diff_mean <- gender %>%
group_by(gender) %>%
summarize(mean_salary = mean(salary)) %>%
pull() %>%
diff()
diff_mean
## [1] 13506.7
#average salary for female and male only
avg <- gender %>%
summarize(mean = mean(salary))
avg
## # A tibble: 1 × 1
## mean
## <dbl>
## 1 115307.
# filter for female only
female <- gender %>%
filter(gender == "Female") %>%
select(salary)
# SD and mean for female
sdfm <- sd(gender$salary)
meanfm <- mean(gender$salary)
# plot to see if it looks like a normal distribution
ggplot(data = female, aes(x = salary)) + geom_blank() +
geom_histogram(aes(y = ..density..)) +
stat_function(fun = dnorm, args = c(mean = meanfm, sd = sdfm), col = "tomato") +
xlab("Female Salaries")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#filter for male only
male <- gender %>%
filter(gender == "Male")
#SD and mean for male
sdm <- sd(gender$salary)
meanm <- mean(gender$salary)
#Plot to see if it looks like a normal distribution
ggplot(data = male, aes(x = salary)) + geom_blank() +
geom_histogram(aes(y = ..density..)) +
stat_function(fun = dnorm, args = c(mean = meanm, sd = sdm), col = "tomato") +
xlab("Male salaries")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#qqplot for female and male
qqnormsim(sample = salary, data = female)
qqnormsim(sample = salary, data = male)
# avg of gender
box <- gender %>%
group_by(gender) %>%
summarize(mean = round(mean(salary)))
box
## # A tibble: 2 × 2
## gender mean
## <chr> <dbl>
## 1 Female 107889
## 2 Male 121396
# boxplot data
text <- gender %>%
group_by(gender) %>%
summarize(median = round(median(salary)), mean = round(mean(salary)), min = min(salary), max = max(salary), q1 = quantile(salary, .25), q3 = quantile(salary, .75), iqr = q3-q1)
text
## # A tibble: 2 × 8
## gender median mean min max q1 q3 iqr
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Female 105000 107889 500 220000 60000 150000 90000
## 2 Male 120000 121396 350 250000 75000 170000 95000
text$labs = c()
#boxplot
ggplot(gender, aes(gender, salary)) +
geom_boxplot() +
geom_point(data = box, aes(gender, y = mean),color ="blue", size = 4) +
geom_text(data = text, aes(gender, y = median, label = median, vjust = -2)) +
geom_text(data = text, aes(gender, y = max, label = max, vjust = -.5)) +
geom_text(data = text, aes(gender, y = min, label = min, vjust = -.1)) +
geom_text(data = text, aes(gender, y = q1, label = q1, vjust = -.3)) +
geom_text(data = text, aes(gender, y = mean, label = mean, vjust = -.3)) +
geom_text(data = text, aes(gender, y = q3, label = q3, vjust = -.5))
#density plot
ggplot(gender, aes(x=salary, color = gender)) +
geom_density()
#Filter for female
sum <- gender %>%
filter(gender == "Female") %>%
summarize(total = median(salary))
sum
## # A tibble: 1 × 1
## total
## <dbl>
## 1 105000
#quartile for 25% and 75%
q1 <- quantile(sum$total, .25)
q3 <- quantile(sum$total, .75)
iqr <- q3 - q1
# filter for male
sum2 <- gender %>%
filter(gender == "Male") %>%
summarize(total = median(salary))
sum2
## # A tibble: 1 × 1
## total
## <dbl>
## 1 120000
#quartile for 25% and 75%
q12 <- quantile(sum$total, .25)
q32 <- quantile(sum$total, .75)
iqr2 <- q3 - q1
Years of Experience and Salary Analysis
#group years of experience
gender %>%
group_by(yrs) %>%
summarize( x_bar =mean(salary),sd =sd(salary),n=n())
## # A tibble: 5 × 4
## yrs x_bar sd n
## <fct> <dbl> <dbl> <int>
## 1 0-5 69059. 32416. 2802
## 2 6-10 125083. 33169. 1785
## 3 11-15 157209. 25973. 1203
## 4 16-20 183362. 20403. 656
## 5 20+ 187091. 21137. 238
#plot the data points
ggplot(gender, aes(x = salary, y = yrs_experience)) +
geom_point() +
geom_smooth(method ="lm", formula= y ~x, color = "darkgreen", se = FALSE)
#bar chart for salary and years of experience
ggplot(gender, aes(yrs, salary)) +
geom_bar(stat = "identity")
# correlation for salary and experience
cor(gender$salary, gender$yrs_experience)
## [1] 0.8109416
ex_lm <- lm(salary~yrs_experience, data = gender)
summary(ex_lm)
##
## Call:
## lm(formula = salary ~ yrs_experience, data = gender)
##
## Residuals:
## Min 1Q Median 3Q Max
## -149561 -22140 -5780 20527 100757
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57935.01 631.92 91.68 <2e-16 ***
## yrs_experience 7102.52 62.69 113.29 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30900 on 6682 degrees of freedom
## Multiple R-squared: 0.6576, Adjusted R-squared: 0.6576
## F-statistic: 1.283e+04 on 1 and 6682 DF, p-value: < 2.2e-16
Education and Salary Analysis
gender %>%
group_by(degree) %>%
summarize( x_bar =mean(salary),sd =sd(salary),n=n())
## # A tibble: 4 × 4
## degree x_bar sd n
## <fct> <dbl> <dbl> <int>
## 1 High School 34416. 16563. 436
## 2 Bachelor 95083. 44092. 3021
## 3 Master 130078. 40650. 1858
## 4 PhD 165651. 34340. 1369
# Mean for each degree
deg_mean <- gender %>%
group_by(degree) %>%
summarize(mean=mean(salary))
deg_mean
## # A tibble: 4 × 2
## degree mean
## <fct> <dbl>
## 1 High School 34416.
## 2 Bachelor 95083.
## 3 Master 130078.
## 4 PhD 165651.
# plot boxplot for degree and salary with mean
ggplot(gender, aes(degree, salary)) +
geom_boxplot() +
geom_point(data = deg_mean, aes(degree, y = mean), color ="blue", size = 2)
#Anova for degree and salary
degree_aov <- aov(salary ~ degree, data = gender)
degree_aov
## Call:
## aov(formula = salary ~ degree, data = gender)
##
## Terms:
## degree Residuals
## Sum of Squares 7.963783e+12 1.067216e+13
## Deg. of Freedom 3 6680
##
## Residual standard error: 39970.35
## Estimated effects may be unbalanced
summary(degree_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## degree 3 7.964e+12 2.655e+12 1662 <2e-16 ***
## Residuals 6680 1.067e+13 1.598e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Age and Salary Analysis
gender %>%
summarize( x_bar =mean(salary),sd =sd(salary),n=n())
## # A tibble: 1 × 3
## x_bar sd n
## <dbl> <dbl> <int>
## 1 115307. 52807. 6684
# average for each age in salary
age_mean <- gender %>%
summarize(mean=mean(salary))
age_mean
## # A tibble: 1 × 1
## mean
## <dbl>
## 1 115307.
# scatter plot for age and salary
ggplot(gender, aes(x=age, salary)) +
geom_point() +
geom_smooth(method ="lm", formula= y ~x, color = "red", se = FALSE)
# correlation for salary and age
cor(gender$salary, gender$age)
## [1] 0.7283429
age_lm <- lm(salary ~ age, data = gender)
summary(age_lm)
##
## Call:
## lm(formula = salary ~ age, data = gender)
##
## Residuals:
## Min 1Q Median 3Q Max
## -112287 -26899 -6645 22150 98165
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -54876.15 2008.02 -27.33 <2e-16 ***
## age 5063.39 58.27 86.89 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36190 on 6682 degrees of freedom
## Multiple R-squared: 0.5305, Adjusted R-squared: 0.5304
## F-statistic: 7550 on 1 and 6682 DF, p-value: < 2.2e-16
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:openintro':
##
## tips
plot <- gender %>%
select(salary, gender, yrs_experience, education)
ggpairs(plot)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
lm_out <- lm(salary ~ gender + yrs_experience + degree, data = gender)
summary(lm_out)
##
## Call:
## lm(formula = salary ~ gender + yrs_experience + degree, data = gender)
##
## Residuals:
## Min 1Q Median 3Q Max
## -133794 -19803 -4199 14412 90640
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23414.6 1403.0 16.689 < 2e-16 ***
## genderMale 5414.6 718.3 7.538 5.42e-14 ***
## yrs_experience 5784.7 73.8 78.383 < 2e-16 ***
## degreeBachelor 37050.4 1494.1 24.798 < 2e-16 ***
## degreeMaster 48591.3 1634.0 29.738 < 2e-16 ***
## degreePhD 58288.5 1816.0 32.098 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28560 on 6678 degrees of freedom
## Multiple R-squared: 0.7078, Adjusted R-squared: 0.7075
## F-statistic: 3235 on 5 and 6678 DF, p-value: < 2.2e-16