# set the environment
Packages <- c("dplyr", "tidyverse","treemap","RColorBrewer","highcharter","readr","plotly")
lapply(Packages, library, character.only = TRUE)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ stringr 1.4.0
## ✓ tidyr 1.1.3 ✓ forcats 0.5.1
## ✓ readr 1.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## [[1]]
## [1] "dplyr" "stats" "graphics" "grDevices" "utils" "datasets"
## [7] "methods" "base"
##
## [[2]]
## [1] "forcats" "stringr" "purrr" "readr" "tidyr" "tibble"
## [7] "ggplot2" "tidyverse" "dplyr" "stats" "graphics" "grDevices"
## [13] "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "treemap" "forcats" "stringr" "purrr" "readr" "tidyr"
## [7] "tibble" "ggplot2" "tidyverse" "dplyr" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "RColorBrewer" "treemap" "forcats" "stringr" "purrr"
## [6] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [11] "dplyr" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[5]]
## [1] "highcharter" "RColorBrewer" "treemap" "forcats" "stringr"
## [6] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [11] "tidyverse" "dplyr" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[6]]
## [1] "highcharter" "RColorBrewer" "treemap" "forcats" "stringr"
## [6] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [11] "tidyverse" "dplyr" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[7]]
## [1] "plotly" "highcharter" "RColorBrewer" "treemap" "forcats"
## [6] "stringr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "dplyr" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
# Show the working directory; the relative CSV path below is resolved against it.
getwd()
## [1] "/Users/shenjiayuan/Desktop"
# Read the lab CSV into `a` and inspect the column types.
a <- read.csv("GSS-Lab-9.17.csv")
str(a)
## 'data.frame': 2936 obs. of 10 variables:
## $ Year : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
## $ ID : int 2 16 19 20 21 24 29 32 33 36 ...
## $ Hours_wkd : chr "42" "42" "40" "40" ...
## $ Occupation : chr "Machinists" "Designers" "Elementary and middle school teachers" "Insurance claims and policy processing clerks" ...
## $ Age : chr "61" "33" "31" "43" ...
## $ Educ : chr "12" "13" "18" "13" ...
## $ Sex : chr "Male" "Female" "Male" "Male" ...
## $ Income : chr "$25000 or more" "$25000 or more" "$25000 or more" "$25000 or more" ...
## $ income_recode: int 25000 25000 25000 25000 25000 25000 25000 25000 25000 25000 ...
## $ educ_recode : chr "1_hs" "2_some_college" "4_Advanced" "2_some_college" ...
# Clean the dataset: lower-case the column names, then strip any spaces.
names(a) <- gsub(" ", "", tolower(names(a)))
str(a)
## 'data.frame': 2936 obs. of 10 variables:
## $ year : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
## $ id : int 2 16 19 20 21 24 29 32 33 36 ...
## $ hours_wkd : chr "42" "42" "40" "40" ...
## $ occupation : chr "Machinists" "Designers" "Elementary and middle school teachers" "Insurance claims and policy processing clerks" ...
## $ age : chr "61" "33" "31" "43" ...
## $ educ : chr "12" "13" "18" "13" ...
## $ sex : chr "Male" "Female" "Male" "Male" ...
## $ income : chr "$25000 or more" "$25000 or more" "$25000 or more" "$25000 or more" ...
## $ income_recode: int 25000 25000 25000 25000 25000 25000 25000 25000 25000 25000 ...
## $ educ_recode : chr "1_hs" "2_some_college" "4_Advanced" "2_some_college" ...
# 2016: mean recoded income per sex, keeping only rows with income_recode > 0.
q1f2016 <- a %>%
  filter(income_recode > 0, sex == "Female", year == 2016) %>%
  summarise(tot_pay_f_2016 = mean(income_recode))
q1f2016
## tot_pay_f_2016
## 1 19199.27
q1m2016 <- a %>%
  filter(income_recode > 0, sex == "Male", year == 2016) %>%
  summarise(tot_pay_m_2016 = mean(income_recode))
q1m2016
## tot_pay_m_2016
## 1 21204.46
# 2018: mean recoded income per sex, keeping only rows with income_recode > 0.
q1f2018 <- a %>%
  filter(income_recode > 0, sex == "Female", year == 2018) %>%
  summarise(tot_pay_f_2018 = mean(income_recode))
q1f2018
## tot_pay_f_2018
## 1 19304.41
q1m2018 <- a %>%
  filter(income_recode > 0, sex == "Male", year == 2018) %>%
  summarise(tot_pay_m_2018 = mean(income_recode))
q1m2018
## tot_pay_m_2018
## 1 21651.14
# Keep rows with a positive recoded income, sorted from highest to lowest.
q1 <- a %>%
  filter(income_recode > 0) %>%
  arrange(desc(income_recode))
# Count of rows at each recoded income value, bars placed side by side per sex.
ggplot(q1, aes(x = income_recode, fill = sex)) +
  geom_bar(position = "dodge") +
  labs(
    x = "Income",
    y = "Frequency",
    title = "Men, in general, earn more money than women?"
  )

# Gather the four one-row mean-income summaries into a single one-row table.
# NOTE(review): `age` holds mean incomes, not ages; the name is kept as-is
# because code outside this section may refer to it.
age <- c(q1f2016, q1f2018, q1m2016, q1m2018)
d <- data.frame(age)
d
## tot_pay_f_2016 tot_pay_f_2018 tot_pay_m_2016 tot_pay_m_2018
## 1 19199.27 19304.41 21204.46 21651.14
# Same dodged income bar chart as above, split into one panel per
# educ_recode value.
ggplot(q1, aes(x = income_recode, fill = sex)) +
  geom_bar(position = "dodge") +
  facet_wrap(~educ_recode) +
  labs(
    x = "Income",
    y = "Frequency",
    title = "Even women with similar education levels tend to have lower earnings than men?"
  )

# Mean and median recoded income, plus row count, for every
# educ_recode / sex combination, ordered by educ_recode.
q2 <- q1 %>%
  group_by(educ_recode, sex) %>%
  summarise(
    tot_pay = mean(income_recode),
    median = median(income_recode),
    count = n()
  ) %>%
  arrange(educ_recode)
## `summarise()` has grouped output by 'educ_recode'. You can override using the `.groups` argument.
q2
## # A tibble: 11 x 5
## # Groups: educ_recode [6]
## educ_recode sex tot_pay median count
## <chr> <chr> <dbl> <dbl> <int>
## 1 0 Female 15000 15000 2
## 2 0_no_1_hs Female 12669. 12500 124
## 3 0_no_1_hs Male 18778. 25000 153
## 4 1_hs Female 18012. 22500 376
## 5 1_hs Male 21348. 25000 405
## 6 2_some_college Female 18626. 25000 439
## 7 2_some_college Male 20763. 25000 348
## 8 3_Bachelors Female 21667. 25000 327
## 9 3_Bachelors Male 22798. 25000 260
## 10 4_Advanced Female 22294. 25000 253
## 11 4_Advanced Male 22763. 25000 209
# Convert hours_wkd from character to numeric; values that are not numbers
# become NA, which is what triggers the coercion warning below.
q1$hours_wkd <- as.numeric(q1[["hours_wkd"]])
## Warning: NAs introduced by coercion
str(q1)
## 'data.frame': 2896 obs. of 10 variables:
## $ year : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
## $ id : int 2 16 19 20 21 24 29 32 33 36 ...
## $ hours_wkd : num 42 42 40 40 40 NA NA 50 40 40 ...
## $ occupation : chr "Machinists" "Designers" "Elementary and middle school teachers" "Insurance claims and policy processing clerks" ...
## $ age : chr "61" "33" "31" "43" ...
## $ educ : chr "12" "13" "18" "13" ...
## $ sex : chr "Male" "Female" "Male" "Male" ...
## $ income : chr "$25000 or more" "$25000 or more" "$25000 or more" "$25000 or more" ...
## $ income_recode: int 25000 25000 25000 25000 25000 25000 25000 25000 25000 25000 ...
## $ educ_recode : chr "1_hs" "2_some_college" "4_Advanced" "2_some_college" ...
# Mean and median recoded income, plus row count, for every
# sex / hours_wkd combination, ordered by hours_wkd.
# Rows with missing hours are removed with is.na(): hours_wkd is numeric, so
# comparing it against the string "NA" never tests for a missing value.
q3 <- q1 %>%
  filter(!is.na(hours_wkd)) %>%
  group_by(sex, hours_wkd) %>%
  summarise(
    tot_pay = mean(income_recode),
    median = median(income_recode),
    count = n()
  ) %>%
  arrange(hours_wkd)
## `summarise()` has grouped output by 'sex'. You can override using the `.groups` argument.
q3
## # A tibble: 143 x 5
## # Groups: sex [2]
## sex hours_wkd tot_pay median count
## <chr> <dbl> <dbl> <dbl> <int>
## 1 Female 1 24375 25000 4
## 2 Male 1 25000 25000 3
## 3 Female 2 25000 25000 2
## 4 Male 2 25000 25000 1
## 5 Female 3 18400 22500 5
## 6 Male 3 25000 25000 2
## 7 Female 4 2167. 500 3
## 8 Male 4 16833. 25000 3
## 9 Female 5 10625 8000 4
## 10 Male 5 18333. 22500 3
## # … with 133 more rows
# Scatter of mean recoded income (tot_pay) against hours_wkd, one panel per
# sex. The title must be chained with `+`; as a separate statement it only
# prints a labels object and the plot is drawn without a title.
ggplot(q3, aes(hours_wkd, tot_pay)) +
  geom_point() +
  facet_wrap(~sex) +
  ggtitle("Mean income by hours worked, by sex")

# Mean and median recoded income, plus row count, for every
# sex / occupation / educ_recode combination; only combinations with more
# than 10 rows are kept, ordered by occupation.
q8 <- q1 %>%
  group_by(sex, occupation, educ_recode) %>%
  summarise(
    tot_pay = mean(income_recode),
    median = median(income_recode),
    count = n()
  ) %>%
  filter(count > 10) %>%
  arrange(occupation)
## `summarise()` has grouped output by 'sex', 'occupation'. You can override using the `.groups` argument.
q8
## # A tibble: 33 x 6
## # Groups: sex, occupation [24]
## sex occupation educ_recode tot_pay median count
## <chr> <chr> <chr> <dbl> <dbl> <int>
## 1 Male Carpenters 1_hs 20542. 25000 12
## 2 Female Cashiers 0_no_1_hs 9773. 7500 11
## 3 Female Cashiers 1_hs 14692. 12500 13
## 4 Female Childcare workers 2_some_colle… 10864. 12500 11
## 5 Male Construction laborers 1_hs 22591. 25000 11
## 6 Female Customer service representatives 2_some_colle… 15679. 17500 14
## 7 Male Driver/sales workers and truck dri… 1_hs 23400 25000 20
## 8 Male Driver/sales workers and truck dri… 2_some_colle… 20071. 25000 14
## 9 Female Elementary and middle school teach… 3_Bachelors 24333. 25000 15
## 10 Female Elementary and middle school teach… 4_Advanced 23421. 25000 19
## # … with 23 more rows
# Mean recoded income (x) against group size (y) for each q8 row, coloured by
# occupation, one panel per sex.
# No `position = "dodge"` here: points carry no width, so dodging does not
# move them and only raises a "Width not defined" warning.
ggplot(data = q8) +
  geom_point(mapping = aes(x = tot_pay, y = count, color = occupation)) +
  facet_wrap(~sex) +
  xlab("Income") +
  ylab("Frequency") +
  ggtitle("Even women with similar education levels and profession tend to have lower earnings than men?")
