# Restore the objects saved in cps.RData into the workspace; the rest of the
# script expects this to provide the data frame `cps`.
load(file="cps.RData")
library(ggplot2)
# Put the columns of `cps` on the search path so they can be referenced by
# bare name (e.g. `race`).
# NOTE(review): attach() exposes a copy taken at this point, so columns added
# to `cps` later in the script (race2, educ2, educ3) are NOT reachable by bare
# name. Prefer `cps$column` in new code.
attach(cps)
# Print the structure of the data set (columns, types and attached labels).
str(cps)
## Classes 'tbl_df', 'tbl' and 'data.frame': 297222 obs. of 13 variables:
## $ year : atomic 1983 1983 1983 1983 1983 ...
## ..- attr(*, "label")= chr "Survey year"
## ..- attr(*, "format.stata")= chr "%8.0g"
## $ state : atomic Pennsylvania Pennsylvania New Jersey New Jersey ...
## ..- attr(*, "label")= chr "State (FIPS code)"
## ..- attr(*, "format.stata")= chr "%12s"
## $ age :Class 'labelled' atomic [1:297222] 40 72 27 88 24 26 65 22 72 16 ...
## .. ..- attr(*, "label")= chr "Age"
## .. ..- attr(*, "format.stata")= chr "%19.0g"
## .. ..- attr(*, "labels")= Named num [1:100] 0 1 2 3 4 5 6 7 8 9 ...
## .. .. ..- attr(*, "names")= chr [1:100] "Under 1 year" "1" "2" "3" ...
## $ gender : atomic Male Male Male Female ...
## ..- attr(*, "label")= chr "Sex"
## ..- attr(*, "format.stata")= chr "%9s"
## $ educ :Class 'labelled' atomic [1:297222] 110 72 72 50 110 73 110 80 72 40 ...
## .. ..- attr(*, "label")= chr "Educational attainment recode"
## .. ..- attr(*, "format.stata")= chr "%51.0g"
## .. ..- attr(*, "labels")= Named num [1:36] 0 1 2 10 11 12 13 14 20 21 ...
## .. .. ..- attr(*, "names")= chr [1:36] "NIU or no schooling" "NIU or blank" "None or preschool" "Grades 1, 2, 3, or 4" ...
## $ chareduc : atomic 4 years of college 12th grade, diploma unclear 12th grade, diploma unclear Grade 10 ...
## ..- attr(*, "label")= chr "Educational attainment recode"
## ..- attr(*, "format.stata")= chr "%51s"
## $ race : atomic White White White White ...
## ..- attr(*, "label")= chr "Race"
## ..- attr(*, "format.stata")= chr "%38s"
## $ empstat :Class 'labelled' atomic [1:297222] 10 34 10 31 10 12 31 10 34 33 ...
## .. ..- attr(*, "label")= chr "Employment status"
## .. ..- attr(*, "format.stata")= chr "%30.0g"
## .. ..- attr(*, "labels")= Named num [1:14] 0 1 10 12 20 21 22 30 31 32 ...
## .. .. ..- attr(*, "names")= chr [1:14] "NIU" "Armed Forces" "At work" "Has job, not at work last week" ...
## $ charempstat: atomic At work NILF, other At work NILF, housework ...
## ..- attr(*, "label")= chr "Employment status"
## ..- attr(*, "format.stata")= chr "%30s"
## $ employed : atomic 1 0 1 0 1 1 0 1 0 0 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ hourwage : atomic NA NA NA NA NA NA NA NA NA NA ...
## ..- attr(*, "label")= chr "Hourly wage"
## ..- attr(*, "format.stata")= chr "%4.2f"
## $ earnweek : atomic NA NA NA NA NA NA NA NA NA NA ...
## ..- attr(*, "label")= chr "Weekly earnings"
## ..- attr(*, "format.stata")= chr "%8.2f"
## $ weight : atomic 0.208 0.167 0.201 0.165 0.205 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
# Frequency table of the detailed race categories.
# The column is read from `cps` explicitly instead of through attach(), so the
# table always reflects the current data frame; naming the argument keeps the
# "race" header in the printed output.
table(race = cps$race)
## race
## American Indian-Asian
## 5
## American Indian/Aleut/Eskimo
## 677
## Asian only
## 5906
## Asian or Pacific Islander
## 3730
## Asian-Hawaiian/Pacific Islander
## 5
## Black-American Indian
## 92
## Black-Asian
## 17
## Black-Hawaiian/Pacific Islander
## 5
## Black/Negro
## 30278
## Hawaiian/Pacific Islander only
## 148
## Other (single) race, n.e.c.
## 1151
## Two or three races, unspecified
## 1
## White
## 254097
## White-American Indian
## 382
## White-American Indian-Asian
## 4
## White-Asian
## 148
## White-Asian-Hawaiian/Pacific Islander
## 1
## White-Black
## 472
## White-Black--Hawaiian/Pacific Islander
## 1
## White-Black-American Indian
## 79
## White-Black-Asian
## 5
## White-Hawaiian/Pacific Islander
## 18
# Columns kept in both state-level extracts.
state_cols <- c("educ", "chareduc", "race", "earnweek", "hourwage")

# Pennsylvania
# `%in%` never returns NA, so rows with a missing state are dropped, exactly
# as subset() would have done.
Penn <- cps[cps$state %in% "Pennsylvania", state_cols]

# New Jersey
NJ <- cps[cps$state %in% "New Jersey", state_cols]

# Race distribution within the Pennsylvania extract.
table(Penn$race)
##
## American Indian-Asian
## 5
## American Indian/Aleut/Eskimo
## 285
## Asian only
## 1681
## Asian or Pacific Islander
## 1109
## Asian-Hawaiian/Pacific Islander
## 2
## Black-American Indian
## 51
## Black-Asian
## 5
## Black-Hawaiian/Pacific Islander
## 5
## Black/Negro
## 14662
## Hawaiian/Pacific Islander only
## 59
## Other (single) race, n.e.c.
## 399
## Two or three races, unspecified
## 1
## White
## 144473
## White-American Indian
## 270
## White-American Indian-Asian
## 1
## White-Asian
## 93
## White-Black
## 251
## White-Black--Hawaiian/Pacific Islander
## 1
## White-Black-American Indian
## 58
## White-Black-Asian
## 2
## White-Hawaiian/Pacific Islander
## 8
# Share of each race category in the Pennsylvania extract.
# `y = ..prop..` together with `group = 1` plots proportions of the whole
# sample rather than raw counts.
# NOTE(review): the `..prop..` notation is deprecated in recent ggplot2
# releases in favour of `after_stat(prop)`; switch once the installed version
# is confirmed to support it.
ggplot(data = Penn, mapping = aes(x = race, y = ..prop.., group = 1)) +
  geom_bar()

# Same chart with slanted x-axis labels so the long category names do not
# overlap.
ggplot(data = Penn, mapping = aes(x = race, y = ..prop.., group = 1)) +
  geom_bar() +
  theme_bw(base_size = 10) +  # black-and-white theme, base font size 10
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # rotate labels 45 degrees

# Alternatively, flip the axes so the category names read horizontally.
ggplot(data = Penn, mapping = aes(x = race, y = ..prop.., group = 1)) +
  geom_bar() +
  coord_flip()
# Show only the largest categories.
# Rows whose race is not listed in `limits` are dropped by the x scale, which
# is why ggplot2 warns about removed rows.
Source_order <- c(
  "White",
  "Black/Negro",
  "Asian only",
  "Asian or Pacific Islander"
)
ggplot(data = Penn, mapping = aes(x = race, y = ..prop.., group = 1)) +
  geom_bar() +
  scale_x_discrete(limits = Source_order)
## Warning: Removed 1496 rows containing non-finite values (stat_count).
# Compare the states: counts per race category, with one bar per state placed
# side by side, restricted to the categories in Source_order.
ggplot(data = cps) +
  geom_bar(mapping = aes(x = race, fill = state), position = "dodge") +
  scale_x_discrete(limits = Source_order)
## Warning: Removed 3211 rows containing non-finite values (stat_count).
Alternatively, you can recode the race variable into fewer categories to make the display easier to read.
cps[["race2"]] <- NA  # create the new column, empty for now
Then recode the old categories into the new categories:
# Collapse the detailed race categories into five groups using one lookup
# table (old label = new label) instead of one assignment per category.
race_recode <- c(
  "White"                                  = "White",
  "Black/Negro"                            = "Black",
  "Asian only"                             = "Asian",
  "American Indian/Aleut/Eskimo"           = "Other",
  "Hawaiian/Pacific Islander only"         = "Other",
  "Other (single) race, n.e.c."            = "Other",
  # NOTE(review): "Asian or Pacific Islander" is a single-race label but is
  # grouped with "Mixed" here, as in the original recode -- confirm this is
  # intended.
  "Asian or Pacific Islander"              = "Mixed",
  "Two or three races, unspecified"        = "Mixed",
  "American Indian-Asian"                  = "Mixed",
  "Asian-Hawaiian/Pacific Islander"        = "Mixed",
  "Black-American Indian"                  = "Mixed",
  "Black-Asian"                            = "Mixed",
  "Black-Hawaiian/Pacific Islander"        = "Mixed",
  "White-American Indian"                  = "Mixed",
  "White-American Indian-Asian"            = "Mixed",
  "White-Asian"                            = "Mixed",
  "White-Asian-Hawaiian/Pacific Islander"  = "Mixed",
  "White-Black"                            = "Mixed",
  "White-Black--Hawaiian/Pacific Islander" = "Mixed",
  "White-Black-American Indian"            = "Mixed",
  "White-Black-Asian"                      = "Mixed",
  "White-Hawaiian/Pacific Islander"        = "Mixed"
)

# as.character() drops the label attributes so the values can be used as
# plain lookup keys; labels missing from the table (and NA) become NA.
race_values <- as.character(cps$race)
cps$race2 <- unname(race_recode[race_values])

# Fail loudly instead of silently leaving NAs when a category has no recode.
unmapped_races <- setdiff(
  unique(race_values[!is.na(race_values)]),
  names(race_recode)
)
if (length(unmapped_races) > 0) {
  warning(
    "race categories without a race2 recode: ",
    paste(unmapped_races, collapse = "; "),
    call. = FALSE
  )
}
Check that the new categories were created correctly (no unexpected missing values):
# Tabulate the recoded variable; an <NA> column appears only if some rows
# were left without a category.
table(cps$race2, useNA = "ifany")
##
## Asian Black Mixed Other White
## 5906 30278 4965 1976 254097
Make a bar chart of race2:
# Share of each recoded race group in the full sample (proportions, not counts).
ggplot(data = cps, mapping = aes(x = race2, y = ..prop.., group = 1)) +
  geom_bar()
# List the education labels and how often each one occurs.
table(cps[["chareduc"]])
##
## 1 year of college
## 4172
## 12th grade, diploma unclear
## 33148
## 12th grade, no diploma
## 2992
## 2 years of college
## 5404
## 3 years of college
## 2054
## 4 years of college
## 9194
## 5 years of college
## 1492
## 6+ years of college
## 4298
## Associate's degree, academic program
## 6903
## Associate's degree, occupational/vocational program
## 6484
## Bachelor's degree
## 35111
## Doctorate degree
## 2359
## Grade 1
## 62
## Grade 10
## 14783
## Grade 11
## 13649
## Grade 2
## 134
## Grade 3
## 259
## Grade 4
## 375
## Grade 5
## 572
## Grade 6
## 1144
## Grade 7
## 1474
## Grade 8
## 5661
## Grade 9
## 11426
## Grades 1, 2, 3, or 4
## 1184
## Grades 5 or 6
## 2310
## Grades 7 or 8
## 8181
## High school diploma or equivalent
## 74697
## Master's degree
## 12683
## NIU or blank
## 868
## None or preschool
## 961
## Professional school degree
## 2986
## Some college but no degree
## 30202
Recode the education variable into years of education
# Translate education labels into years of schooling through a named lookup
# vector; labels that are not listed (everything except Grade 1 and Grade 2
# so far) stay NA.
# The years are written without quotes so that educ2 is numeric: R would
# store "1" as text, not as a number.
cps$educ2 <- unname(
  c("Grade 1" = 1, "Grade 2" = 2)[as.character(cps$chareduc)]
)
When the recoding is done, check that your new variable was created correctly, without unexpected missing values or errors:
# Tabulate years of education, including the rows that are still NA.
table(cps$educ2, useNA = "ifany")
##
## 1 2 <NA>
## 62 134 297026
Now recode the years of education (educ2) into a few simple categories
cps[["educ3"]] <- NA  # create the new column, empty for now
# which() keeps only the rows where the comparison is TRUE, so rows with a
# missing educ2 are left as NA.
cps$educ3[which(cps$educ2 < 12)] <- "Less than high school"
Check if you see any NAs in the new variable
# table(cps$educ3, exclude = NULL)