url <- "https://raw.githubusercontent.com/geeman1209/MSDATA2020/master/DATA607/Project%202/Dataset3/gender_educational_attainment.csv"
rD <- read.csv(url, header = FALSE, stringsAsFactors = FALSE, na.strings = c("", "NA"))
##Get a quick overview of the downloaded content
#View summary of data
summary(rD)
## V1 V2 V3 V4
## Length:6 Length:6 Length:6 Length:6
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## V5 V6 V7 V8
## Length:6 Length:6 Length:6 Length:6
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## V9 V10 V11
## Length:6 Length:6 Length:6
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
#View head of data
head(rD)
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 <NA> Male Male Male Male Male Female Female
## 2 <NA> No HS HS Associate Bachelor's Graduate No HS HS
## 3 Northeast Region 24801 36714 44765 65177 89449 16523 24251
## 4 Midwest Region 23530 33606 41180 59243 77325 15471 21713
## 5 South Region 22065 31552 40097 59711 81744 14659 21530
## 6 West Region 22554 32289 42127 62069 86476 15250 23230
## V9 V10 V11
## 1 Female Female Female
## 2 Associate Bachelor's Graduate
## 3 31455 45593 61203
## 4 26944 39534 54246
## 5 27243 41210 53100
## 6 29402 42527 57914
#View structure of data
str(rD)
## 'data.frame': 6 obs. of 11 variables:
## $ V1 : chr NA NA "Northeast Region" "Midwest Region" ...
## $ V2 : chr "Male" "No HS" "24801" "23530" ...
## $ V3 : chr "Male" "HS" "36714" "33606" ...
## $ V4 : chr "Male" "Associate" "44765" "41180" ...
## $ V5 : chr "Male" "Bachelor's" "65177" "59243" ...
## $ V6 : chr "Male" "Graduate" "89449" "77325" ...
## $ V7 : chr "Female" "No HS" "16523" "15471" ...
## $ V8 : chr "Female" "HS" "24251" "21713" ...
## $ V9 : chr "Female" "Associate" "31455" "26944" ...
## $ V10: chr "Female" "Bachelor's" "45593" "39534" ...
## $ V11: chr "Female" "Graduate" "61203" "54246" ...
##Using Dplyr
glimpse(rD)
## Observations: 6
## Variables: 11
## $ V1 <chr> NA, NA, "Northeast Region", "Midwest Region", "South Region", "...
## $ V2 <chr> "Male", "No HS", "24801", "23530", "22065", "22554"
## $ V3 <chr> "Male", "HS", "36714", "33606", "31552", "32289"
## $ V4 <chr> "Male", "Associate", "44765", "41180", "40097", "42127"
## $ V5 <chr> "Male", "Bachelor's", "65177", "59243", "59711", "62069"
## $ V6 <chr> "Male", "Graduate", "89449", "77325", "81744", "86476"
## $ V7 <chr> "Female", "No HS", "16523", "15471", "14659", "15250"
## $ V8 <chr> "Female", "HS", "24251", "21713", "21530", "23230"
## $ V9 <chr> "Female", "Associate", "31455", "26944", "27243", "29402"
## $ V10 <chr> "Female", "Bachelor's", "45593", "39534", "41210", "42527"
## $ V11 <chr> "Female", "Graduate", "61203", "54246", "53100", "57914"
#Column Names
names(rD)
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10" "V11"
I want to get rid of the double header and merge them into one. This eliminates 2 rows and gets rid of the NA values in the first two rows in the Region column.
Compare the data long vs. wide.
The wide format is easier to read.
colnames(rD) <- c("Region", "Male_NoHS", "Male_HS", "Male_Assoc", "Male_Bachelors", "Male_Graduate", "Female_NoHS", "Female_HS", "Female_Assoc", "Female_Bachelors", "Female_Graduate")
rd2 <- rD[-c(1,2),]
rd3 <- gather(rd2, "Education_Gender", "Salary", 1:11, -c(Region))
head(rd2)
## Region Male_NoHS Male_HS Male_Assoc Male_Bachelors Male_Graduate
## 3 Northeast Region 24801 36714 44765 65177 89449
## 4 Midwest Region 23530 33606 41180 59243 77325
## 5 South Region 22065 31552 40097 59711 81744
## 6 West Region 22554 32289 42127 62069 86476
## Female_NoHS Female_HS Female_Assoc Female_Bachelors Female_Graduate
## 3 16523 24251 31455 45593 61203
## 4 15471 21713 26944 39534 54246
## 5 14659 21530 27243 41210 53100
## 6 15250 23230 29402 42527 57914
str(rd2)
## 'data.frame': 4 obs. of 11 variables:
## $ Region : chr "Northeast Region" "Midwest Region" "South Region" "West Region"
## $ Male_NoHS : chr "24801" "23530" "22065" "22554"
## $ Male_HS : chr "36714" "33606" "31552" "32289"
## $ Male_Assoc : chr "44765" "41180" "40097" "42127"
## $ Male_Bachelors : chr "65177" "59243" "59711" "62069"
## $ Male_Graduate : chr "89449" "77325" "81744" "86476"
## $ Female_NoHS : chr "16523" "15471" "14659" "15250"
## $ Female_HS : chr "24251" "21713" "21530" "23230"
## $ Female_Assoc : chr "31455" "26944" "27243" "29402"
## $ Female_Bachelors: chr "45593" "39534" "41210" "42527"
## $ Female_Graduate : chr "61203" "54246" "53100" "57914"
summary(rd2)
## Region Male_NoHS Male_HS Male_Assoc
## Length:4 Length:4 Length:4 Length:4
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Male_Bachelors Male_Graduate Female_NoHS Female_HS
## Length:4 Length:4 Length:4 Length:4
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Female_Assoc Female_Bachelors Female_Graduate
## Length:4 Length:4 Length:4
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
Calculate the average incomes by gender and education level across.
Convert all the characters that should be numeric to numeric.
Create 2 data frames with all the average incomes.
#convert character values to numeric
rd2$Male_NoHS <- as.numeric(rd2$Male_NoHS)
rd2$Male_HS <- as.numeric(rd2$Male_HS)
rd2$Male_Assoc <- as.numeric(rd2$Male_Assoc)
rd2$Male_Bachelors <- as.numeric(rd2$Male_Bachelors)
rd2$Male_Graduate <- as.numeric(rd2$Male_Graduate)
#calculate averages for income by educational level
no_hs_male<- mean(rd2$Male_NoHS)
male_hs <- mean(rd2$Male_HS)
Male_Assoc <- mean(rd2$Male_Assoc)
male_bachelor <- mean(rd2$Male_Bachelors)
male_graduate <- mean(rd2$Male_Graduate)
AvgDF <- data.frame(no_hs_male, male_hs, Male_Assoc, male_bachelor, male_graduate)
#convert to numeric
rd2$Female_NoHS <- as.numeric(rd2$Female_NoHS)
rd2$Female_HS <- as.numeric(rd2$Female_HS)
rd2$Female_Assoc <- as.numeric(rd2$Female_Assoc)
rd2$Female_Bachelors <- as.numeric(rd2$Female_Bachelors)
rd2$Female_Graduate <- as.numeric(rd2$Female_Graduate)
#Calculate averages per educational levels
no_hs_female<- mean(rd2$Female_NoHS)
female_hs <- mean(rd2$Female_HS)
female_Assoc <- mean(rd2$Female_Assoc)
female_bachelor <- mean(rd2$Female_Bachelors)
female_graduate <- mean(rd2$Female_Graduate)
AvgDF2 <- data.frame(no_hs_female, female_hs, female_Assoc, female_bachelor, female_graduate)
#Difference in income between genders/educational levels
DiffinAvg <- AvgDF-AvgDF2
colnames(DiffinAvg) <- c("M_F_NoHS", "M_F_HS", "M_F_Assoc", "M_F_Bachelors", "M_F_Grad")
rownames(DiffinAvg) <- "Income_Diff"
DiffinAvg <- DiffinAvg %>% gather("M_F_NoHS:M_F_Grad", Income_Diff, M_F_NoHS:M_F_Grad)
#made a mistake with the column name and changing it
colnames(DiffinAvg)[1] <- "Gender_Education"
print(DiffinAvg)
## Gender_Education Income_Diff
## 1 M_F_NoHS 7761.75
## 2 M_F_HS 10859.25
## 3 M_F_Assoc 13281.25
## 4 M_F_Bachelors 19334.00
## 5 M_F_Grad 27132.75
In the table above, I needed to rearrange the data into columns so I could graph the income difference between Men and Women across different educational levels.
Below, I added the incomes for those with No High School, High School, and Associates into a column called Low Education (by gender) and added the incomes of those who acquired a Bachelor’s and Graduate degree into High Education column (also by gender). The rows are still arranged by region so it is possible to see the income differences between a male with a “high education level” from the Northeast versus a male who lives in the south with a “low education level”. The same logic can applied to women as well.
I then took the difference low education and high education per region. Which yielded some interesting results for women.
rd4 <- tibble(rd2$Region)
rd2 %>% select(Male_NoHS:Male_Assoc) %>% rowSums(na.rm=TRUE) -> rd4$Male_LowEdu_Total
rd2 %>% select(Male_Bachelors:Male_Graduate) %>% rowSums(na.rm=TRUE) -> rd4$Male_HighEdu_Total
rd4$Male_LowEdu_Total - rd4$Male_HighEdu_Total -> rd4$Male_IncDiff
rd2 %>% select(Female_NoHS:Female_Assoc) %>% rowSums(na.rm=TRUE) -> rd4$Female_LowEdu_Total
rd2 %>% select(Female_Bachelors:Female_Graduate) %>% rowSums(na.rm=TRUE) -> rd4$Female_HighEdu_Total
rd4$Female_LowEdu_Total - rd4$Female_HighEdu_Total-> rd4$Female_IncDiff
library(knitr)
kable(rd4)
| rd2$Region | Male_LowEdu_Total | Male_HighEdu_Total | Male_IncDiff | Female_LowEdu_Total | Female_HighEdu_Total | Female_IncDiff |
|---|---|---|---|---|---|---|
| Northeast Region | 106280 | 154626 | -48346 | 72229 | 106796 | -34567 |
| Midwest Region | 98316 | 136568 | -38252 | 64128 | 93780 | -29652 |
| South Region | 93714 | 141455 | -47741 | 63432 | 94310 | -30878 |
| West Region | 96970 | 148545 | -51575 | 67882 | 100441 | -32559 |
| ## Bar Graph |
ggplot(DiffinAvg, aes(x=Gender_Education, y=Income_Diff)) + geom_bar(stat = "identity")
The difference in education and income levels between genders is significant and large, every increasing income gaps as educational level increases. This is surprising to me as it is commonly perceived that education is an equalizer in pay, especially attaining higher education levels. However, the data doesn’t specify what degrees are being acquired per gender. Some degrees are associated with higher degrees of pay such as computer science, engineering, law, medicine, etc. So a closer look into which degrees are being attained per gender is worthwhile.
Secondly, comparing the “high education” totals for women and viewing the “low education” totals for men, the difference isn’t that significant across regions.