Import Libraries

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Read the Data

url = 'https://raw.githubusercontent.com/AlphaCurse/Occupational-Income/main/OccupationalIncome.csv'
df = read.csv(url)
head(df)
##                     ï..Occupation                   All_workers
## 1                                 Number of workers (thousands)
## 2                 ALL OCCUPATIONS                        109080
## 3                      MANAGEMENT                         12480
## 4                Chief executives                          1046
## 5 General and operations managers                           823
## 6                     Legislators                             8
##                     All_weekly                         M_workers
## 1 Median weekly income workers Numer of male workers (thousands)
## 2                          809                             60746
## 3                        1,351                              7332
## 4                        2,041                               763
## 5                        1,260                               621
## 6                         <NA>                                 6
##                     M_weekly                           F_workers
## 1 Median weekly income males Numer of female workers (thousands)
## 2                        895                               48334
## 3                      1,486                                5147
## 4                      2,251                                 283
## 5                      1,347                                 202
## 6                       <NA>                                   4
##                      F_weekly
## 1 Median weekly income female
## 2                         726
## 3                       1,139
## 4                       1,836
## 5                       1,002
## 6                        <NA>

Tidy and Transform Data

I will change the name of each column and eliminate the row specifying qualitity of data.

colnames(df)[1] = 'Occupation'
colnames(df)[2] = 'Workers'
colnames(df)[3] = 'Weekly_Income'
colnames(df)[4] = 'Male_Workers'
colnames(df)[5] = 'Male_Weekly_Income'
colnames(df)[6] = 'Female_Workers'
colnames(df)[7] = 'Female_Weekly_Income'
df = df %>%
  filter(!row_number() %in% c(1))
head(df)
##                            Occupation Workers Weekly_Income Male_Workers
## 1                     ALL OCCUPATIONS  109080           809        60746
## 2                          MANAGEMENT   12480         1,351         7332
## 3                    Chief executives    1046         2,041          763
## 4     General and operations managers     823         1,260          621
## 5                         Legislators       8          <NA>            6
## 6 Advertising and promotions managers      55         1,050           29
##   Male_Weekly_Income Female_Workers Female_Weekly_Income
## 1                895          48334                  726
## 2              1,486           5147                1,139
## 3              2,251            283                1,836
## 4              1,347            202                1,002
## 5               <NA>              4                 <NA>
## 6               <NA>             26                 <NA>

I will change each column from chr to int while eliminating all commas in each column.

df$Weekly_Income = as.integer(gsub(",","",df$Weekly_Income))
df$Male_Workers = as.integer(gsub(",","",df$Male_Workers))
df$Male_Weekly_Income = as.integer(gsub(",","",df$Male_Weekly_Income))
df$Female_Workers = as.integer(gsub(",","",df$Female_Workers))
df$Female_Weekly_Income = as.integer(gsub(",","",df$Female_Weekly_Income))
head(df)
##                            Occupation Workers Weekly_Income Male_Workers
## 1                     ALL OCCUPATIONS  109080           809        60746
## 2                          MANAGEMENT   12480          1351         7332
## 3                    Chief executives    1046          2041          763
## 4     General and operations managers     823          1260          621
## 5                         Legislators       8            NA            6
## 6 Advertising and promotions managers      55          1050           29
##   Male_Weekly_Income Female_Workers Female_Weekly_Income
## 1                895          48334                  726
## 2               1486           5147                 1139
## 3               2251            283                 1836
## 4               1347            202                 1002
## 5                 NA              4                   NA
## 6                 NA             26                   NA

Analysis comparing the median weekly incomes for 535 different occupations to identify the difference in income between males and females.

To identify the difference in income between males and females, we can isolate the Occupation, Male_Weekly_Income and Female_Weekly_Income columns respectively.

new_df = df %>%
  select(Occupation, Male_Weekly_Income, Female_Weekly_Income)
head(new_df)
##                            Occupation Male_Weekly_Income Female_Weekly_Income
## 1                     ALL OCCUPATIONS                895                  726
## 2                          MANAGEMENT               1486                 1139
## 3                    Chief executives               2251                 1836
## 4     General and operations managers               1347                 1002
## 5                         Legislators                 NA                   NA
## 6 Advertising and promotions managers                 NA                   NA

For consistency, any row with missing data will need to be removed.

new_df = new_df %>%
  filter(!is.na(Male_Weekly_Income))
new_df = new_df %>%
  filter(!is.na(Female_Weekly_Income))
new_df
##                                            Occupation Male_Weekly_Income
## 1                                     ALL OCCUPATIONS                895
## 2                                          MANAGEMENT               1486
## 3                                    Chief executives               2251
## 4                     General and operations managers               1347
## 5                        Marketing and sales managers               1603
## 6                    Administrative services managers               1451
## 7           Computer and information systems managers               1817
## 8                                  Financial managers               1732
## 9                            Human resources managers               1495
## 10                                Purchasing managers               1404
## 11 Transportation, storage, and distribution managers               1006
##    Female_Weekly_Income
## 1                   726
## 2                  1139
## 3                  1836
## 4                  1002
## 5                  1258
## 6                   981
## 7                  1563
## 8                  1130
## 9                  1274
## 10                 1226
## 11                  749

Now that the data is tidy and transformed, we can create a new column that specifies the difference between the two columns.

new_df$Difference = new_df$Male_Weekly_Income - new_df$Female_Weekly_Income
new_df
##                                            Occupation Male_Weekly_Income
## 1                                     ALL OCCUPATIONS                895
## 2                                          MANAGEMENT               1486
## 3                                    Chief executives               2251
## 4                     General and operations managers               1347
## 5                        Marketing and sales managers               1603
## 6                    Administrative services managers               1451
## 7           Computer and information systems managers               1817
## 8                                  Financial managers               1732
## 9                            Human resources managers               1495
## 10                                Purchasing managers               1404
## 11 Transportation, storage, and distribution managers               1006
##    Female_Weekly_Income Difference
## 1                   726        169
## 2                  1139        347
## 3                  1836        415
## 4                  1002        345
## 5                  1258        345
## 6                   981        470
## 7                  1563        254
## 8                  1130        602
## 9                  1274        221
## 10                 1226        178
## 11                  749        257

From the Difference column, we can see the difference between weekly income from males and females. The data shows males consistently make a higher weekly income than females. Additionally, I added a percentage difference in weekly income between the two genders. We can analyze the percentage amount males get paid more than females per occupation.

new_df$Total = new_df$Male_Weekly_Income + new_df$Female_Weekly_Income
new_df$Percentage = ((new_df$Male_Weekly_Income / new_df$Female_Weekly_Income)*100)-100
new_df
##                                            Occupation Male_Weekly_Income
## 1                                     ALL OCCUPATIONS                895
## 2                                          MANAGEMENT               1486
## 3                                    Chief executives               2251
## 4                     General and operations managers               1347
## 5                        Marketing and sales managers               1603
## 6                    Administrative services managers               1451
## 7           Computer and information systems managers               1817
## 8                                  Financial managers               1732
## 9                            Human resources managers               1495
## 10                                Purchasing managers               1404
## 11 Transportation, storage, and distribution managers               1006
##    Female_Weekly_Income Difference Total Percentage
## 1                   726        169  1621   23.27824
## 2                  1139        347  2625   30.46532
## 3                  1836        415  4087   22.60349
## 4                  1002        345  2349   34.43114
## 5                  1258        345  2861   27.42448
## 6                   981        470  2432   47.91030
## 7                  1563        254  3380   16.25080
## 8                  1130        602  2862   53.27434
## 9                  1274        221  2769   17.34694
## 10                 1226        178  2630   14.51876
## 11                  749        257  1755   34.31242