Project2_Dataset3

Project 2 Dataset 3

url <- "https://raw.githubusercontent.com/geeman1209/MSDATA2020/master/DATA607/Project%202/Dataset3/gender_educational_attainment.csv"

rD <- read.csv(url, header = FALSE, stringsAsFactors = FALSE, na.strings = c("", "NA"))


##Get a quick overview of the downloaded content

#View summary of data
summary(rD)

##       V1                 V2                 V3                 V4           
##  Length:6           Length:6           Length:6           Length:6          
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##       V5                 V6                 V7                 V8           
##  Length:6           Length:6           Length:6           Length:6          
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##       V9                V10                V11           
##  Length:6           Length:6           Length:6          
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character

#View head of data
head(rD)

##                 V1    V2    V3        V4         V5       V6     V7     V8
## 1             <NA>  Male  Male      Male       Male     Male Female Female
## 2             <NA> No HS    HS Associate Bachelor's Graduate  No HS     HS
## 3 Northeast Region 24801 36714     44765      65177    89449  16523  24251
## 4   Midwest Region 23530 33606     41180      59243    77325  15471  21713
## 5     South Region 22065 31552     40097      59711    81744  14659  21530
## 6      West Region 22554 32289     42127      62069    86476  15250  23230
##          V9        V10      V11
## 1    Female     Female   Female
## 2 Associate Bachelor's Graduate
## 3     31455      45593    61203
## 4     26944      39534    54246
## 5     27243      41210    53100
## 6     29402      42527    57914

#View structure of data
str(rD)

## 'data.frame':    6 obs. of  11 variables:
##  $ V1 : chr  NA NA "Northeast Region" "Midwest Region" ...
##  $ V2 : chr  "Male" "No HS" "24801" "23530" ...
##  $ V3 : chr  "Male" "HS" "36714" "33606" ...
##  $ V4 : chr  "Male" "Associate" "44765" "41180" ...
##  $ V5 : chr  "Male" "Bachelor's" "65177" "59243" ...
##  $ V6 : chr  "Male" "Graduate" "89449" "77325" ...
##  $ V7 : chr  "Female" "No HS" "16523" "15471" ...
##  $ V8 : chr  "Female" "HS" "24251" "21713" ...
##  $ V9 : chr  "Female" "Associate" "31455" "26944" ...
##  $ V10: chr  "Female" "Bachelor's" "45593" "39534" ...
##  $ V11: chr  "Female" "Graduate" "61203" "54246" ...

##Using Dplyr
glimpse(rD)

## Observations: 6
## Variables: 11
## $ V1  <chr> NA, NA, "Northeast Region", "Midwest Region", "South Region", "...
## $ V2  <chr> "Male", "No HS", "24801", "23530", "22065", "22554"
## $ V3  <chr> "Male", "HS", "36714", "33606", "31552", "32289"
## $ V4  <chr> "Male", "Associate", "44765", "41180", "40097", "42127"
## $ V5  <chr> "Male", "Bachelor's", "65177", "59243", "59711", "62069"
## $ V6  <chr> "Male", "Graduate", "89449", "77325", "81744", "86476"
## $ V7  <chr> "Female", "No HS", "16523", "15471", "14659", "15250"
## $ V8  <chr> "Female", "HS", "24251", "21713", "21530", "23230"
## $ V9  <chr> "Female", "Associate", "31455", "26944", "27243", "29402"
## $ V10 <chr> "Female", "Bachelor's", "45593", "39534", "41210", "42527"
## $ V11 <chr> "Female", "Graduate", "61203", "54246", "53100", "57914"

#Column Names 
names(rD)

##  [1] "V1"  "V2"  "V3"  "V4"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10" "V11"

I want to get rid of the double header and merge them into one. This eliminates 2 rows and gets rid of the NA values in the first two rows in the Region column.

Compare the data long vs. wide.

The wide format is easier to read.

colnames(rD) <- c("Region", "Male_NoHS", "Male_HS", "Male_Assoc", "Male_Bachelors", "Male_Graduate", "Female_NoHS", "Female_HS", "Female_Assoc", "Female_Bachelors", "Female_Graduate")

rd2 <- rD[-c(1,2),]

rd3 <- gather(rd2, "Education_Gender", "Salary", 1:11, -c(Region))


head(rd2)

##             Region Male_NoHS Male_HS Male_Assoc Male_Bachelors Male_Graduate
## 3 Northeast Region     24801   36714      44765          65177         89449
## 4   Midwest Region     23530   33606      41180          59243         77325
## 5     South Region     22065   31552      40097          59711         81744
## 6      West Region     22554   32289      42127          62069         86476
##   Female_NoHS Female_HS Female_Assoc Female_Bachelors Female_Graduate
## 3       16523     24251        31455            45593           61203
## 4       15471     21713        26944            39534           54246
## 5       14659     21530        27243            41210           53100
## 6       15250     23230        29402            42527           57914

str(rd2)

## 'data.frame':    4 obs. of  11 variables:
##  $ Region          : chr  "Northeast Region" "Midwest Region" "South Region" "West Region"
##  $ Male_NoHS       : chr  "24801" "23530" "22065" "22554"
##  $ Male_HS         : chr  "36714" "33606" "31552" "32289"
##  $ Male_Assoc      : chr  "44765" "41180" "40097" "42127"
##  $ Male_Bachelors  : chr  "65177" "59243" "59711" "62069"
##  $ Male_Graduate   : chr  "89449" "77325" "81744" "86476"
##  $ Female_NoHS     : chr  "16523" "15471" "14659" "15250"
##  $ Female_HS       : chr  "24251" "21713" "21530" "23230"
##  $ Female_Assoc    : chr  "31455" "26944" "27243" "29402"
##  $ Female_Bachelors: chr  "45593" "39534" "41210" "42527"
##  $ Female_Graduate : chr  "61203" "54246" "53100" "57914"

summary(rd2)

##     Region           Male_NoHS           Male_HS           Male_Assoc       
##  Length:4           Length:4           Length:4           Length:4          
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  Male_Bachelors     Male_Graduate      Female_NoHS         Female_HS        
##  Length:4           Length:4           Length:4           Length:4          
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  Female_Assoc       Female_Bachelors   Female_Graduate   
##  Length:4           Length:4           Length:4          
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character

Calculate the average incomes by gender and education level across.

Convert all the characters that should be numeric to numeric.

Create 2 data frames with all the average incomes.

#convert character values to numeric
rd2$Male_NoHS <- as.numeric(rd2$Male_NoHS)
rd2$Male_HS <- as.numeric(rd2$Male_HS)
rd2$Male_Assoc <- as.numeric(rd2$Male_Assoc)
rd2$Male_Bachelors <- as.numeric(rd2$Male_Bachelors)
rd2$Male_Graduate <- as.numeric(rd2$Male_Graduate)

#calculate averages for income by educational level
no_hs_male<- mean(rd2$Male_NoHS)
male_hs <- mean(rd2$Male_HS)
Male_Assoc <- mean(rd2$Male_Assoc)
male_bachelor <- mean(rd2$Male_Bachelors)
male_graduate <- mean(rd2$Male_Graduate)

AvgDF <- data.frame(no_hs_male, male_hs, Male_Assoc, male_bachelor, male_graduate)

#convert to numeric
rd2$Female_NoHS <- as.numeric(rd2$Female_NoHS)
rd2$Female_HS <- as.numeric(rd2$Female_HS)
rd2$Female_Assoc <- as.numeric(rd2$Female_Assoc)
rd2$Female_Bachelors <- as.numeric(rd2$Female_Bachelors)
rd2$Female_Graduate <- as.numeric(rd2$Female_Graduate)

#Calculate averages per educational levels
no_hs_female<- mean(rd2$Female_NoHS)
female_hs <- mean(rd2$Female_HS)
female_Assoc <- mean(rd2$Female_Assoc)
female_bachelor <- mean(rd2$Female_Bachelors)
female_graduate <- mean(rd2$Female_Graduate)

AvgDF2 <- data.frame(no_hs_female, female_hs, female_Assoc, female_bachelor, female_graduate)

#Difference in income between genders/educational levels
DiffinAvg <- AvgDF-AvgDF2
colnames(DiffinAvg) <- c("M_F_NoHS", "M_F_HS", "M_F_Assoc", "M_F_Bachelors", "M_F_Grad")
rownames(DiffinAvg) <- "Income_Diff"

DiffinAvg <- DiffinAvg %>% gather("M_F_NoHS:M_F_Grad", Income_Diff, M_F_NoHS:M_F_Grad)

#made a mistake with the column name and changing it
colnames(DiffinAvg)[1] <- "Gender_Education"

print(DiffinAvg)

##   Gender_Education Income_Diff
## 1         M_F_NoHS     7761.75
## 2           M_F_HS    10859.25
## 3        M_F_Assoc    13281.25
## 4    M_F_Bachelors    19334.00
## 5         M_F_Grad    27132.75

In the table above, I needed to rearrange the data into columns so I could graph the income difference between Men and Women across different educational levels.

Below, I added the incomes for those with No High School, High School, and Associates into a column called Low Education (by gender) and added the incomes of those who acquired a Bachelor’s and Graduate degree into High Education column (also by gender). The rows are still arranged by region so it is possible to see the income differences between a male with a “high education level” from the Northeast versus a male who lives in the south with a “low education level”. The same logic can applied to women as well.

I then took the difference low education and high education per region. Which yielded some interesting results for women.

rd4 <- tibble(rd2$Region)

rd2 %>% select(Male_NoHS:Male_Assoc) %>% rowSums(na.rm=TRUE) -> rd4$Male_LowEdu_Total
rd2 %>% select(Male_Bachelors:Male_Graduate) %>% rowSums(na.rm=TRUE) -> rd4$Male_HighEdu_Total
rd4$Male_LowEdu_Total - rd4$Male_HighEdu_Total -> rd4$Male_IncDiff


rd2 %>% select(Female_NoHS:Female_Assoc) %>% rowSums(na.rm=TRUE) -> rd4$Female_LowEdu_Total
rd2 %>% select(Female_Bachelors:Female_Graduate) %>% rowSums(na.rm=TRUE) -> rd4$Female_HighEdu_Total
rd4$Female_LowEdu_Total - rd4$Female_HighEdu_Total-> rd4$Female_IncDiff

library(knitr)
kable(rd4)

rd2$Region	Male_LowEdu_Total	Male_HighEdu_Total	Male_IncDiff	Female_LowEdu_Total	Female_HighEdu_Total	Female_IncDiff
Northeast Region	106280	154626	-48346	72229	106796	-34567
Midwest Region	98316	136568	-38252	64128	93780	-29652
South Region	93714	141455	-47741	63432	94310	-30878
West Region	96970	148545	-51575	67882	100441	-32559
## Bar Graph

ggplot(DiffinAvg, aes(x=Gender_Education, y=Income_Diff)) + geom_bar(stat = "identity")

Project2_Dataset3

Gabe Abreu

3/7/2020

Project 2 Dataset 3

Conclusion