# Load ggplot2 for the plots, then the saved workspace that is expected to
# provide the `cps` data frame used below.
library(ggplot2)
load("cps.RData")
# attach() lets columns be referenced by bare name (e.g. table(race)).
# Columns added to cps after this point are not visible by bare name;
# refer to them as cps$<name>.
attach(cps)
# Print the structure of the data (variables, types, labels).
str(cps)
## Classes 'tbl_df', 'tbl' and 'data.frame':    297222 obs. of  13 variables:
##  $ year       : atomic  1983 1983 1983 1983 1983 ...
##   ..- attr(*, "label")= chr "Survey year"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ state      : atomic  Pennsylvania Pennsylvania New Jersey New Jersey ...
##   ..- attr(*, "label")= chr "State (FIPS code)"
##   ..- attr(*, "format.stata")= chr "%12s"
##  $ age        :Class 'labelled'  atomic [1:297222] 40 72 27 88 24 26 65 22 72 16 ...
##   .. ..- attr(*, "label")= chr "Age"
##   .. ..- attr(*, "format.stata")= chr "%19.0g"
##   .. ..- attr(*, "labels")= Named num [1:100] 0 1 2 3 4 5 6 7 8 9 ...
##   .. .. ..- attr(*, "names")= chr [1:100] "Under 1 year" "1" "2" "3" ...
##  $ gender     : atomic  Male Male Male Female ...
##   ..- attr(*, "label")= chr "Sex"
##   ..- attr(*, "format.stata")= chr "%9s"
##  $ educ       :Class 'labelled'  atomic [1:297222] 110 72 72 50 110 73 110 80 72 40 ...
##   .. ..- attr(*, "label")= chr "Educational attainment recode"
##   .. ..- attr(*, "format.stata")= chr "%51.0g"
##   .. ..- attr(*, "labels")= Named num [1:36] 0 1 2 10 11 12 13 14 20 21 ...
##   .. .. ..- attr(*, "names")= chr [1:36] "NIU or no schooling" "NIU or blank" "None or preschool" "Grades 1, 2, 3, or 4" ...
##  $ chareduc   : atomic  4 years of college 12th grade, diploma unclear 12th grade, diploma unclear Grade 10 ...
##   ..- attr(*, "label")= chr "Educational attainment recode"
##   ..- attr(*, "format.stata")= chr "%51s"
##  $ race       : atomic  White White White White ...
##   ..- attr(*, "label")= chr "Race"
##   ..- attr(*, "format.stata")= chr "%38s"
##  $ empstat    :Class 'labelled'  atomic [1:297222] 10 34 10 31 10 12 31 10 34 33 ...
##   .. ..- attr(*, "label")= chr "Employment status"
##   .. ..- attr(*, "format.stata")= chr "%30.0g"
##   .. ..- attr(*, "labels")= Named num [1:14] 0 1 10 12 20 21 22 30 31 32 ...
##   .. .. ..- attr(*, "names")= chr [1:14] "NIU" "Armed Forces" "At work" "Has job, not at work last week" ...
##  $ charempstat: atomic  At work NILF, other At work NILF, housework ...
##   ..- attr(*, "label")= chr "Employment status"
##   ..- attr(*, "format.stata")= chr "%30s"
##  $ employed   : atomic  1 0 1 0 1 1 0 1 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ hourwage   : atomic  NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "label")= chr "Hourly wage"
##   ..- attr(*, "format.stata")= chr "%4.2f"
##  $ earnweek   : atomic  NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "label")= chr "Weekly earnings"
##   ..- attr(*, "format.stata")= chr "%8.2f"
##  $ weight     : atomic  0.208 0.167 0.201 0.165 0.205 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"

To see a summary of a categorical variable, for example race:

# Frequency table of race. The column is taken from cps explicitly rather than
# through attach(), and named so the printed header still reads "race".
table(race = cps$race)
## race
##                  American Indian-Asian 
##                                      5 
##           American Indian/Aleut/Eskimo 
##                                    677 
##                             Asian only 
##                                   5906 
##              Asian or Pacific Islander 
##                                   3730 
##        Asian-Hawaiian/Pacific Islander 
##                                      5 
##                  Black-American Indian 
##                                     92 
##                            Black-Asian 
##                                     17 
##        Black-Hawaiian/Pacific Islander 
##                                      5 
##                            Black/Negro 
##                                  30278 
##         Hawaiian/Pacific Islander only 
##                                    148 
##            Other (single) race, n.e.c. 
##                                   1151 
##        Two or three races, unspecified 
##                                      1 
##                                  White 
##                                 254097 
##                  White-American Indian 
##                                    382 
##            White-American Indian-Asian 
##                                      4 
##                            White-Asian 
##                                    148 
##  White-Asian-Hawaiian/Pacific Islander 
##                                      1 
##                            White-Black 
##                                    472 
## White-Black--Hawaiian/Pacific Islander 
##                                      1 
##            White-Black-American Indian 
##                                     79 
##                      White-Black-Asian 
##                                      5 
##        White-Hawaiian/Pacific Islander 
##                                     18

Select the Pennsylvania and New Jersey cases, keeping only selected variables

# Variables kept in both state-level extracts.
state_vars <- c("educ", "chareduc", "race", "earnweek", "hourwage")

# Pennsylvania
Penn <- subset(cps, state == "Pennsylvania", select = state_vars)
# New Jersey
NJ <- subset(cps, state == "New Jersey", select = state_vars)

Bar graph

# Frequency of each race category among the Pennsylvania cases.
table(Penn$race)
## 
##                  American Indian-Asian 
##                                      5 
##           American Indian/Aleut/Eskimo 
##                                    285 
##                             Asian only 
##                                   1681 
##              Asian or Pacific Islander 
##                                   1109 
##        Asian-Hawaiian/Pacific Islander 
##                                      2 
##                  Black-American Indian 
##                                     51 
##                            Black-Asian 
##                                      5 
##        Black-Hawaiian/Pacific Islander 
##                                      5 
##                            Black/Negro 
##                                  14662 
##         Hawaiian/Pacific Islander only 
##                                     59 
##            Other (single) race, n.e.c. 
##                                    399 
##        Two or three races, unspecified 
##                                      1 
##                                  White 
##                                 144473 
##                  White-American Indian 
##                                    270 
##            White-American Indian-Asian 
##                                      1 
##                            White-Asian 
##                                     93 
##                            White-Black 
##                                    251 
## White-Black--Hawaiian/Pacific Islander 
##                                      1 
##            White-Black-American Indian 
##                                     58 
##                      White-Black-Asian 
##                                      2 
##        White-Hawaiian/Pacific Islander 
##                                      8
# Bar graph of the share of Pennsylvania cases in each race category.
# group = 1 treats all rows as one group, so ..prop.. is a share of the whole
# sample instead of 1 for every bar.
ggplot(data = Penn, mapping = aes(x = race, y = ..prop.., group = 1)) +
  geom_bar()

# Same plot with the x-axis labels rotated 45 degrees so the long category
# names do not overlap.
ggplot(data = Penn, mapping = aes(x = race, y = ..prop.., group = 1)) +
  geom_bar() +
  # Black-and-white theme; base_size sets the base font size.
  theme_bw(base_size = 10) +
  # Rotate the labels and right-align them under their bars.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Alternatively, swap the x and y axes so the category names read horizontally.
ggplot(data = Penn, mapping = aes(x = race, y = ..prop.., group = 1)) +
  geom_bar() +
  coord_flip()

# You can also display only a few large categories, in a chosen order.
# Rows whose race is not among the limits are dropped from the plot, and
# ggplot2 warns about the removed rows.
Source_order <- c("White", "Black/Negro", "Asian only", "Asian or Pacific Islander")
ggplot(data = Penn, mapping = aes(x = race, y = ..prop.., group = 1)) +
  geom_bar() +
  scale_x_discrete(limits = Source_order)
## Warning: Removed 1496 rows containing non-finite values (stat_count).

# Compare the two states: counts per race category, one dodged bar per state,
# restricted to the categories listed in Source_order.
ggplot(data = cps, mapping = aes(x = race, fill = state)) +
  geom_bar(position = "dodge") +
  scale_x_discrete(limits = Source_order)
## Warning: Removed 3211 rows containing non-finite values (stat_count).

Or you can recode the race variable into fewer categories to facilitate display

# Create the new variable as a character column filled with missing values.
cps$race2 <- NA_character_

Then recode the old categories into the new categories

# Collapse the detailed race categories into five groups using one lookup
# table (old category = new group) instead of one assignment per category.
# Any race value that is not listed here ends up as NA in race2.
# NOTE(review): "Asian or Pacific Islander" is kept in "Mixed", as in the
# original recode; it may be a single-race category -- confirm the intended
# group.
race_groups <- c(
  "White" = "White",
  "Black/Negro" = "Black",
  "Asian only" = "Asian",
  "American Indian/Aleut/Eskimo" = "Other",
  "Hawaiian/Pacific Islander only" = "Other",
  "Other (single) race, n.e.c." = "Other",
  "Asian or Pacific Islander" = "Mixed",
  "American Indian-Asian" = "Mixed",
  "Asian-Hawaiian/Pacific Islander" = "Mixed",
  "Black-American Indian" = "Mixed",
  "Black-Asian" = "Mixed",
  "Black-Hawaiian/Pacific Islander" = "Mixed",
  "Two or three races, unspecified" = "Mixed",
  "White-American Indian" = "Mixed",
  "White-American Indian-Asian" = "Mixed",
  "White-Asian" = "Mixed",
  "White-Asian-Hawaiian/Pacific Islander" = "Mixed",
  "White-Black" = "Mixed",
  "White-Black--Hawaiian/Pacific Islander" = "Mixed",
  "White-Black-American Indian" = "Mixed",
  "White-Black-Asian" = "Mixed",
  "White-Hawaiian/Pacific Islander" = "Mixed"
)
# Look every row up by its race label; unname() drops the labels that the
# lookup attaches to the result.
cps$race2 <- unname(race_groups[as.character(cps$race)])

Check that the new categories were created correctly

# Tabulate the recoded variable. exclude=NULL keeps missing values in the
# table, so any category that was not recoded would show up as <NA>.
table(cps$race2, exclude=NULL)
## 
##  Asian  Black  Mixed  Other  White 
##   5906  30278   4965   1976 254097

Make a bar graph of race2

# Share of all cases in each recoded race group (group = 1 makes ..prop.. a
# share of the whole sample).
ggplot(data = cps, mapping = aes(x = race2, y = ..prop.., group = 1)) +
  geom_bar()

Educational attainment

# Frequency of each educational attainment label (text version of educ).
table(cps$chareduc)
## 
##                                   1 year of college 
##                                                4172 
##                         12th grade, diploma unclear 
##                                               33148 
##                              12th grade, no diploma 
##                                                2992 
##                                  2 years of college 
##                                                5404 
##                                  3 years of college 
##                                                2054 
##                                  4 years of college 
##                                                9194 
##                                  5 years of college 
##                                                1492 
##                                 6+ years of college 
##                                                4298 
##                Associate's degree, academic program 
##                                                6903 
## Associate's degree, occupational/vocational program 
##                                                6484 
##                                   Bachelor's degree 
##                                               35111 
##                                    Doctorate degree 
##                                                2359 
##                                             Grade 1 
##                                                  62 
##                                            Grade 10 
##                                               14783 
##                                            Grade 11 
##                                               13649 
##                                             Grade 2 
##                                                 134 
##                                             Grade 3 
##                                                 259 
##                                             Grade 4 
##                                                 375 
##                                             Grade 5 
##                                                 572 
##                                             Grade 6 
##                                                1144 
##                                             Grade 7 
##                                                1474 
##                                             Grade 8 
##                                                5661 
##                                             Grade 9 
##                                               11426 
##                                Grades 1, 2, 3, or 4 
##                                                1184 
##                                       Grades 5 or 6 
##                                                2310 
##                                       Grades 7 or 8 
##                                                8181 
##                   High school diploma or equivalent 
##                                               74697 
##                                     Master's degree 
##                                               12683 
##                                        NIU or blank 
##                                                 868 
##                                   None or preschool 
##                                                 961 
##                          Professional school degree 
##                                                2986 
##                          Some college but no degree 
##                                               30202

Recode the education variable into years of education

# Years of education for each attainment label. Only "Grade 1" and "Grade 2"
# are recoded here; every other label is left as NA in educ2.
# The years are written as bare numbers, not in quotes: R would store "1" as
# text rather than as a number.
years_by_label <- c("Grade 1" = 1, "Grade 2" = 2)
# Look every row up by its label; unname() drops the labels from the result.
cps$educ2 <- unname(years_by_label[as.character(cps$chareduc)])

When the recoding is done, check that the new variable was created correctly, without unexpected missing values or errors

# Tabulate the recoded years of education. exclude = NULL keeps missing
# values in the table, so rows that were not recoded are counted under <NA>.
table(cps$educ2, exclude = NULL)
## 
##      1      2   <NA> 
##     62    134 297026

Now recode the years of education (educ2) into a few simple categories

# Create the new variable as a character column filled with missing values,
# so it is character even if no row matches below.
cps$educ3 <- NA_character_
# educ2 is NA for rows that were not recoded, which makes the comparison NA
# as well; which() keeps only the rows where it is TRUE.
cps$educ3[which(cps$educ2 < 12)] <- "Less than high school"

Check if you see any NAs in the new variable

# table(cps$educ3, exclude = NULL)