library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
url = 'https://raw.githubusercontent.com/AlphaCurse/Occupational-Income/main/OccupationalIncome.csv'
df = read.csv(url)
head(df)
## ï..Occupation All_workers
## 1 Number of workers (thousands)
## 2 ALL OCCUPATIONS 109080
## 3 MANAGEMENT 12480
## 4 Chief executives 1046
## 5 General and operations managers 823
## 6 Legislators 8
## All_weekly M_workers
## 1 Median weekly income workers Numer of male workers (thousands)
## 2 809 60746
## 3 1,351 7332
## 4 2,041 763
## 5 1,260 621
## 6 <NA> 6
## M_weekly F_workers
## 1 Median weekly income males Numer of female workers (thousands)
## 2 895 48334
## 3 1,486 5147
## 4 2,251 283
## 5 1,347 202
## 6 <NA> 4
## F_weekly
## 1 Median weekly income female
## 2 726
## 3 1,139
## 4 1,836
## 5 1,002
## 6 <NA>
I will change the name of each column and eliminate the row specifying qualitity of data.
colnames(df)[1] = 'Occupation'
colnames(df)[2] = 'Workers'
colnames(df)[3] = 'Weekly_Income'
colnames(df)[4] = 'Male_Workers'
colnames(df)[5] = 'Male_Weekly_Income'
colnames(df)[6] = 'Female_Workers'
colnames(df)[7] = 'Female_Weekly_Income'
df = df %>%
filter(!row_number() %in% c(1))
head(df)
## Occupation Workers Weekly_Income Male_Workers
## 1 ALL OCCUPATIONS 109080 809 60746
## 2 MANAGEMENT 12480 1,351 7332
## 3 Chief executives 1046 2,041 763
## 4 General and operations managers 823 1,260 621
## 5 Legislators 8 <NA> 6
## 6 Advertising and promotions managers 55 1,050 29
## Male_Weekly_Income Female_Workers Female_Weekly_Income
## 1 895 48334 726
## 2 1,486 5147 1,139
## 3 2,251 283 1,836
## 4 1,347 202 1,002
## 5 <NA> 4 <NA>
## 6 <NA> 26 <NA>
I will change each column from chr to int while eliminating all commas in each column.
df$Weekly_Income = as.integer(gsub(",","",df$Weekly_Income))
df$Male_Workers = as.integer(gsub(",","",df$Male_Workers))
df$Male_Weekly_Income = as.integer(gsub(",","",df$Male_Weekly_Income))
df$Female_Workers = as.integer(gsub(",","",df$Female_Workers))
df$Female_Weekly_Income = as.integer(gsub(",","",df$Female_Weekly_Income))
head(df)
## Occupation Workers Weekly_Income Male_Workers
## 1 ALL OCCUPATIONS 109080 809 60746
## 2 MANAGEMENT 12480 1351 7332
## 3 Chief executives 1046 2041 763
## 4 General and operations managers 823 1260 621
## 5 Legislators 8 NA 6
## 6 Advertising and promotions managers 55 1050 29
## Male_Weekly_Income Female_Workers Female_Weekly_Income
## 1 895 48334 726
## 2 1486 5147 1139
## 3 2251 283 1836
## 4 1347 202 1002
## 5 NA 4 NA
## 6 NA 26 NA
To identify the difference in income between males and females, we can isolate the Occupation, Male_Weekly_Income and Female_Weekly_Income columns respectively.
new_df = df %>%
select(Occupation, Male_Weekly_Income, Female_Weekly_Income)
head(new_df)
## Occupation Male_Weekly_Income Female_Weekly_Income
## 1 ALL OCCUPATIONS 895 726
## 2 MANAGEMENT 1486 1139
## 3 Chief executives 2251 1836
## 4 General and operations managers 1347 1002
## 5 Legislators NA NA
## 6 Advertising and promotions managers NA NA
For consistency, any row with missing data will need to be removed.
new_df = new_df %>%
filter(!is.na(Male_Weekly_Income))
new_df = new_df %>%
filter(!is.na(Female_Weekly_Income))
new_df
## Occupation Male_Weekly_Income
## 1 ALL OCCUPATIONS 895
## 2 MANAGEMENT 1486
## 3 Chief executives 2251
## 4 General and operations managers 1347
## 5 Marketing and sales managers 1603
## 6 Administrative services managers 1451
## 7 Computer and information systems managers 1817
## 8 Financial managers 1732
## 9 Human resources managers 1495
## 10 Purchasing managers 1404
## 11 Transportation, storage, and distribution managers 1006
## Female_Weekly_Income
## 1 726
## 2 1139
## 3 1836
## 4 1002
## 5 1258
## 6 981
## 7 1563
## 8 1130
## 9 1274
## 10 1226
## 11 749
Now that the data is tidy and transformed, we can create a new column that specifies the difference between the two columns.
new_df$Difference = new_df$Male_Weekly_Income - new_df$Female_Weekly_Income
new_df
## Occupation Male_Weekly_Income
## 1 ALL OCCUPATIONS 895
## 2 MANAGEMENT 1486
## 3 Chief executives 2251
## 4 General and operations managers 1347
## 5 Marketing and sales managers 1603
## 6 Administrative services managers 1451
## 7 Computer and information systems managers 1817
## 8 Financial managers 1732
## 9 Human resources managers 1495
## 10 Purchasing managers 1404
## 11 Transportation, storage, and distribution managers 1006
## Female_Weekly_Income Difference
## 1 726 169
## 2 1139 347
## 3 1836 415
## 4 1002 345
## 5 1258 345
## 6 981 470
## 7 1563 254
## 8 1130 602
## 9 1274 221
## 10 1226 178
## 11 749 257
From the Difference column, we can see the difference between weekly income from males and females. The data shows males consistently make a higher weekly income than females. Additionally, I added a percentage difference in weekly income between the two genders. We can analyze the percentage amount males get paid more than females per occupation.
new_df$Total = new_df$Male_Weekly_Income + new_df$Female_Weekly_Income
new_df$Percentage = ((new_df$Male_Weekly_Income / new_df$Female_Weekly_Income)*100)-100
new_df
## Occupation Male_Weekly_Income
## 1 ALL OCCUPATIONS 895
## 2 MANAGEMENT 1486
## 3 Chief executives 2251
## 4 General and operations managers 1347
## 5 Marketing and sales managers 1603
## 6 Administrative services managers 1451
## 7 Computer and information systems managers 1817
## 8 Financial managers 1732
## 9 Human resources managers 1495
## 10 Purchasing managers 1404
## 11 Transportation, storage, and distribution managers 1006
## Female_Weekly_Income Difference Total Percentage
## 1 726 169 1621 23.27824
## 2 1139 347 2625 30.46532
## 3 1836 415 4087 22.60349
## 4 1002 345 2349 34.43114
## 5 1258 345 2861 27.42448
## 6 981 470 2432 47.91030
## 7 1563 254 3380 16.25080
## 8 1130 602 2862 53.27434
## 9 1274 221 2769 17.34694
## 10 1226 178 2630 14.51876
## 11 749 257 1755 34.31242