# set the environment 
Packages <- c("dplyr", "tidyverse","treemap","RColorBrewer","highcharter","readr","plotly")

lapply(Packages, library, character.only = TRUE)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ stringr 1.4.0
## ✓ tidyr   1.1.3     ✓ forcats 0.5.1
## ✓ readr   1.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## [[1]]
## [1] "dplyr"     "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [7] "methods"   "base"     
## 
## [[2]]
##  [1] "forcats"   "stringr"   "purrr"     "readr"     "tidyr"     "tibble"   
##  [7] "ggplot2"   "tidyverse" "dplyr"     "stats"     "graphics"  "grDevices"
## [13] "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "treemap"   "forcats"   "stringr"   "purrr"     "readr"     "tidyr"    
##  [7] "tibble"    "ggplot2"   "tidyverse" "dplyr"     "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[4]]
##  [1] "RColorBrewer" "treemap"      "forcats"      "stringr"      "purrr"       
##  [6] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [11] "dplyr"        "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "highcharter"  "RColorBrewer" "treemap"      "forcats"      "stringr"     
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "dplyr"        "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "highcharter"  "RColorBrewer" "treemap"      "forcats"      "stringr"     
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "dplyr"        "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "plotly"       "highcharter"  "RColorBrewer" "treemap"      "forcats"     
##  [6] "stringr"      "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "dplyr"        "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"
# Show the working directory that the relative CSV path below resolves against
getwd()
## [1] "/Users/shenjiayuan/Desktop"
# Load the survey extract into `a` and inspect the column types as read
a <- read.csv(file = "GSS-Lab-9.17.csv")
str(a)
## 'data.frame':    2936 obs. of  10 variables:
##  $ Year         : int  2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
##  $ ID           : int  2 16 19 20 21 24 29 32 33 36 ...
##  $ Hours_wkd    : chr  "42" "42" "40" "40" ...
##  $ Occupation   : chr  "Machinists" "Designers" "Elementary and middle school teachers" "Insurance claims and policy processing clerks" ...
##  $ Age          : chr  "61" "33" "31" "43" ...
##  $ Educ         : chr  "12" "13" "18" "13" ...
##  $ Sex          : chr  "Male" "Female" "Male" "Male" ...
##  $ Income       : chr  "$25000 or more" "$25000 or more" "$25000 or more" "$25000 or more" ...
##  $ income_recode: int  25000 25000 25000 25000 25000 25000 25000 25000 25000 25000 ...
##  $ educ_recode  : chr  "1_hs" "2_some_college" "4_Advanced" "2_some_college" ...
# Clean the dataset: lower-case every column name, then strip any spaces
names(a) <- gsub(" ", "", tolower(names(a)))
str(a)
## 'data.frame':    2936 obs. of  10 variables:
##  $ year         : int  2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
##  $ id           : int  2 16 19 20 21 24 29 32 33 36 ...
##  $ hours_wkd    : chr  "42" "42" "40" "40" ...
##  $ occupation   : chr  "Machinists" "Designers" "Elementary and middle school teachers" "Insurance claims and policy processing clerks" ...
##  $ age          : chr  "61" "33" "31" "43" ...
##  $ educ         : chr  "12" "13" "18" "13" ...
##  $ sex          : chr  "Male" "Female" "Male" "Male" ...
##  $ income       : chr  "$25000 or more" "$25000 or more" "$25000 or more" "$25000 or more" ...
##  $ income_recode: int  25000 25000 25000 25000 25000 25000 25000 25000 25000 25000 ...
##  $ educ_recode  : chr  "1_hs" "2_some_college" "4_Advanced" "2_some_college" ...
# 2016 figures (rows with a positive recoded income only)
# Mean recoded income of female respondents in 2016
q1f2016 <- summarise(
  filter(a, income_recode > 0, sex == "Female", year == 2016),
  tot_pay_f_2016 = mean(income_recode)
)
q1f2016
##   tot_pay_f_2016
## 1       19199.27
# Mean recoded income of male respondents in 2016 (positive incomes only)
q1m2016 <- summarise(
  filter(a, income_recode > 0, sex == "Male", year == 2016),
  tot_pay_m_2016 = mean(income_recode)
)
q1m2016
##   tot_pay_m_2016
## 1       21204.46
# 2018 figures (rows with a positive recoded income only)
# Mean recoded income of female respondents in 2018
q1f2018 <- summarise(
  filter(a, income_recode > 0, sex == "Female", year == 2018),
  tot_pay_f_2018 = mean(income_recode)
)
q1f2018
##   tot_pay_f_2018
## 1       19304.41
# Mean recoded income of male respondents in 2018 (positive incomes only)
q1m2018 <- summarise(
  filter(a, income_recode > 0, sex == "Male", year == 2018),
  tot_pay_m_2018 = mean(income_recode)
)
q1m2018
##   tot_pay_m_2018
## 1       21651.14
# Respondents with a positive recoded income, sorted from highest to lowest
q1 <- arrange(filter(a, income_recode > 0), desc(income_recode))

# Dodged bar chart: respondent count at each recoded income value, by sex
ggplot(q1, aes(x = income_recode, fill = sex)) +
  geom_bar(position = "dodge") +
  labs(
    x = "Income",
    y = "Frequency",
    title = "Men, in general, earn more money than women?"
  )

# Gather the four sex-by-year mean incomes into one single-row data frame.
# NOTE(review): `age` holds mean incomes, not ages; the name is kept unchanged
# in case later code refers to it.
age <- c(q1f2016, q1f2018, q1m2016, q1m2018)
d <- data.frame(age)
d
##   tot_pay_f_2016 tot_pay_f_2018 tot_pay_m_2016 tot_pay_m_2018
## 1       19199.27       19304.41       21204.46       21651.14
# Same dodged income-by-sex bar chart, drawn as one panel per education level
ggplot(q1, aes(x = income_recode, fill = sex)) +
  geom_bar(position = "dodge") +
  facet_wrap(~ educ_recode) +
  labs(
    x = "Income",
    y = "Frequency",
    title = "Even women with similar education levels tend to have lower earnings than men?"
  )

# Mean income, median income and respondent count for every
# education level x sex combination, ordered by education level
q2 <- q1 %>%
  group_by(educ_recode, sex) %>%
  summarise(
    tot_pay = mean(income_recode),
    median = median(income_recode),
    count = n()
  ) %>%
  arrange(educ_recode)
## `summarise()` has grouped output by 'educ_recode'. You can override using the `.groups` argument.
q2
## # A tibble: 11 x 5
## # Groups:   educ_recode [6]
##    educ_recode    sex    tot_pay median count
##    <chr>          <chr>    <dbl>  <dbl> <int>
##  1 0              Female  15000   15000     2
##  2 0_no_1_hs      Female  12669.  12500   124
##  3 0_no_1_hs      Male    18778.  25000   153
##  4 1_hs           Female  18012.  22500   376
##  5 1_hs           Male    21348.  25000   405
##  6 2_some_college Female  18626.  25000   439
##  7 2_some_college Male    20763.  25000   348
##  8 3_Bachelors    Female  21667.  25000   327
##  9 3_Bachelors    Male    22798.  25000   260
## 10 4_Advanced     Female  22294.  25000   253
## 11 4_Advanced     Male    22763.  25000   209
# hours_wkd was read as text; coerce it to numeric. Entries that are not
# numbers become NA, which is what raises the coercion warning shown below.
q1[["hours_wkd"]] <- as.numeric(q1[["hours_wkd"]])
## Warning: NAs introduced by coercion
str(q1)
## 'data.frame':    2896 obs. of  10 variables:
##  $ year         : int  2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
##  $ id           : int  2 16 19 20 21 24 29 32 33 36 ...
##  $ hours_wkd    : num  42 42 40 40 40 NA NA 50 40 40 ...
##  $ occupation   : chr  "Machinists" "Designers" "Elementary and middle school teachers" "Insurance claims and policy processing clerks" ...
##  $ age          : chr  "61" "33" "31" "43" ...
##  $ educ         : chr  "12" "13" "18" "13" ...
##  $ sex          : chr  "Male" "Female" "Male" "Male" ...
##  $ income       : chr  "$25000 or more" "$25000 or more" "$25000 or more" "$25000 or more" ...
##  $ income_recode: int  25000 25000 25000 25000 25000 25000 25000 25000 25000 25000 ...
##  $ educ_recode  : chr  "1_hs" "2_some_college" "4_Advanced" "2_some_college" ...
# Mean income, median income and respondent count for every
# sex x hours_wkd combination, ordered by hours_wkd.
# Rows with a missing hours_wkd are removed with is.na() before grouping.
# The earlier `hours_wkd != 'NA'` test compared a numeric column with the
# string "NA" and only dropped missing rows because the comparison itself
# evaluated to NA.
q3 <- q1 %>%
  filter(!is.na(hours_wkd)) %>%
  group_by(sex, hours_wkd) %>%
  summarize(
    tot_pay = mean(income_recode),
    median = median(income_recode),
    count = n()
  ) %>%
  arrange(hours_wkd)
## `summarise()` has grouped output by 'sex'. You can override using the `.groups` argument.
q3
## # A tibble: 143 x 5
## # Groups:   sex [2]
##    sex    hours_wkd tot_pay median count
##    <chr>      <dbl>   <dbl>  <dbl> <int>
##  1 Female         1  24375   25000     4
##  2 Male           1  25000   25000     3
##  3 Female         2  25000   25000     2
##  4 Male           2  25000   25000     1
##  5 Female         3  18400   22500     5
##  6 Male           3  25000   25000     2
##  7 Female         4   2167.    500     3
##  8 Male           4  16833.  25000     3
##  9 Female         5  10625    8000     4
## 10 Male           5  18333.  22500     3
## # … with 133 more rows
  # Scatter of mean income (tot_pay) against hours_wkd, one panel per sex.
  # The title call used to be a detached statement (no `+` after facet_wrap()),
  # so it never reached the plot and only printed a labels object. Its text also
  # described an unrelated mortgage-debt chart. It is now chained onto the plot
  # and describes this figure.
  # NOTE(review): title assumes hours_wkd means hours worked — confirm.
  ggplot(q3, aes(hours_wkd, tot_pay)) +
    geom_point() +
    facet_wrap(~sex) +
    ggtitle("Mean income by hours worked, split by sex")
# Mean income, median income and respondent count for every
# sex x occupation x education cell; only cells with more than 10
# respondents are kept, ordered by occupation
q8 <- q1 %>%
  group_by(sex, occupation, educ_recode) %>%
  summarise(
    tot_pay = mean(income_recode),
    median = median(income_recode),
    count = n()
  ) %>%
  filter(count > 10) %>%
  arrange(occupation)
## `summarise()` has grouped output by 'sex', 'occupation'. You can override using the `.groups` argument.
q8
## # A tibble: 33 x 6
## # Groups:   sex, occupation [24]
##    sex    occupation                          educ_recode   tot_pay median count
##    <chr>  <chr>                               <chr>           <dbl>  <dbl> <int>
##  1 Male   Carpenters                          1_hs           20542.  25000    12
##  2 Female Cashiers                            0_no_1_hs       9773.   7500    11
##  3 Female Cashiers                            1_hs           14692.  12500    13
##  4 Female Childcare workers                   2_some_colle…  10864.  12500    11
##  5 Male   Construction laborers               1_hs           22591.  25000    11
##  6 Female Customer service representatives    2_some_colle…  15679.  17500    14
##  7 Male   Driver/sales workers and truck dri… 1_hs           23400   25000    20
##  8 Male   Driver/sales workers and truck dri… 2_some_colle…  20071.  25000    14
##  9 Female Elementary and middle school teach… 3_Bachelors    24333.  25000    15
## 10 Female Elementary and middle school teach… 4_Advanced     23421.  25000    19
## # … with 23 more rows
# Scatter of each q8 cell: mean income on x, respondent count on y,
# coloured by occupation, one panel per sex.
# `position = "dodge"` was removed from geom_point(): no dodge width was
# defined, so it could not offset the points and only raised the
# "Width not defined" warning on every render.
ggplot(data = q8) +
  geom_point(mapping = aes(x = tot_pay, y = count, color = occupation)) +
  facet_wrap(~sex) +
  xlab("Income") +
  ylab("Frequency") +
  ggtitle("Even women with similar education levels and profession tend to have lower earnings than men?")