1 Load IPUMS Data

# NOTE: To load data, you must download both the extract's data and the DDI
# and also set the working directory to the folder with these files (or change the path below).

# Fail fast with an actionable message when ipumsr is not installed.
# requireNamespace() only checks availability (it does not attach the package),
# so the package is attached exactly once by library() below.
if (!requireNamespace("ipumsr", quietly = TRUE)) {
  stop(
    "Reading IPUMS data into R requires the ipumsr package. ",
    "It can be installed using the following command: ",
    "install.packages('ipumsr')",
    call. = FALSE
  )
}
# install.packages('ipumsr')
library("ipumsr")

# Deliberately no `rm(list = ls())` here: it silently wipes whatever the user
# has in their workspace, yet does not give a clean session (attached packages
# and options survive). Restart R if a clean slate is needed.

# Parse the extract's DDI codebook, then load the microdata it describes.
# The path is relative, so the files must be in the working directory
# (or change the path below).
ddi <- read_ipums_ddi("cps_00043.xml")
data <- read_ipums_micro(ddi)
## Use of data from IPUMS CPS is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.

2 EDA

2.1 Create the required chart

# Attach ggplot2 for the charts in this section.
library(ggplot2)

# First look at the untouched extract: wage income against the labor-force
# code, one point per row.
ggplot(data, aes(LABFORCE, INCWAGE)) +
  geom_point()
## Warning: Removed 2602318 rows containing missing values (`geom_point()`).

# 99999999 = N.I.U. (Not in Universe)
# 99999998 = Missing (1962-1966 only)


# Work on a trimmed copy so the raw extract in `data` stays untouched.
# Columns are chosen by position (1, 3 and 12 through 16), so this depends on
# the column order of this particular extract.
df <- data[c(1, 3, 12:16)]

# `inc` is a working copy of INCWAGE that can be cleaned without altering the
# original column.
df[["inc"]] <- df$INCWAGE


# psych supplies describe(); attaching it masks ggplot2's `%+%` and `alpha`
# (see the attach message below).
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Summary of the income copy while the 99999999 / 99999998 placeholder codes
# are still present.
describe(df[["inc"]])
##    vars      n     mean       sd median  trimmed     mad min   max range skew
## X1    1 474234 20945796 40657777  33000 20945796 48925.8   0 1e+08 1e+08 1.43
##    kurtosis       se
## X1     0.05 59040.13
# Blank out the placeholder codes (99999999 = N.I.U., 99999998 = Missing) in
# one step. which() turns the comparison into row positions, so rows that are
# already NA are simply skipped.
df[["inc"]][
  which(df[["inc"]] == 99999999 | df[["inc"]] == 99999998)
] <- NA

# Same summary again, now that the placeholder codes are NA.
describe(df[["inc"]])
##    vars      n    mean       sd median trimmed   mad min     max   range skew
## X1    1 375035 35487.1 66514.04  15000 35487.1 22239   0 2099999 2099999 7.67
##    kurtosis     se
## X1    103.9 108.61
# List the value labels attached to LABFORCE (code -> description).
ipums_val_labels(df[["LABFORCE"]])
## # A tibble: 3 × 2
##     val lbl                       
##   <int> <chr>                     
## 1     0 NIU                       
## 2     1 No, not in the labor force
## 3     2 Yes, in the labor force
# Income by labor-force status, with the full value labels on the x axis.
#
# LABFORCE holds integer codes (0/1/2). Mapped as-is, ggplot2 treats it as a
# continuous variable, so a discrete scale's labels keyed by "0"/"1"/"2" are
# never matched. Converting the codes to character makes the axis discrete and
# lets the labels apply; `name` keeps the axis title as "LABFORCE".
ggplot(
  data = df,
  mapping = aes(x = as.character(LABFORCE), y = inc)
) +
  geom_point() +
  scale_x_discrete(
    name = "LABFORCE",
    labels = c(
      "0" = "Not in Universe",
      "1" = "No, not in the Labor Force",
      "2" = "Yes, In the Labor Force"
    )
  )
## Warning: Removed 2701517 rows containing missing values (`geom_point()`).

# Income by labor-force status, with abbreviated x-axis labels.
#
# LABFORCE holds integer codes (0/1/2). Mapped as-is, ggplot2 treats it as a
# continuous variable, so the character breaks "0"/"1"/"2" of a discrete scale
# never line up with the data. Converting the codes to character makes the
# axis discrete so the breaks and labels apply; `name` keeps the axis title
# as "LABFORCE".
ggplot(
  data = df,
  mapping = aes(x = as.character(LABFORCE), y = inc)
) +
  geom_point() +
  scale_x_discrete(
    name = "LABFORCE",
    breaks = c("0", "1", "2"),
    labels = c("NIU", "NILF", "ILF")
  )
## Warning: Removed 2701517 rows containing missing values (`geom_point()`).

# Build a character version of LABFORCE and swap each code for a short label:
# "0" -> "NIU", "1" -> "NILF", "2" -> "ILF". Any other value is left as-is.
df[["labforce"]] <- as.character(df[["LABFORCE"]])

df[["labforce"]] <- replace(
  df[["labforce"]], which(df[["labforce"]] == "0"), "NIU"
)
df[["labforce"]] <- replace(
  df[["labforce"]], which(df[["labforce"]] == "1"), "NILF"
)
df[["labforce"]] <- replace(
  df[["labforce"]], which(df[["labforce"]] == "2"), "ILF"
)

# Final chart: cleaned income against the relabelled labor-force status,
# with a two-line title and explicit axis titles.
ggplot(df, aes(labforce, inc)) +
  geom_point() +
  ggtitle("Income Wage by Labor Force Status \n Current Population Survey") +
  xlab("Labor Force Status") +
  ylab("Personal Income")
## Warning: Removed 2701517 rows containing missing values (`geom_point()`).