Work Experience by Sex and Race Among 30-Year-Olds
# Set working directory
# setwd()

# Read the space-separated extract and name its columns by reference number.
new_data <- read.table("wfed540assignment2.dat", sep = " ")
names(new_data) <- c(
  "R0000100", "R0536300", "R1482600",
  "T6651300", "Z9065500", "Z9065700"
)

# Handle missing values: each negative code below is recoded to NA wherever it
# occurs in the data frame.
new_data[new_data == -1] <- NA  # Refused
new_data[new_data == -2] <- NA  # Dont know
new_data[new_data == -3] <- NA  # Invalid missing
new_data[new_data == -4] <- NA  # Valid missing
new_data[new_data == -5] <- NA  # Non-interview
# Convert the raw numeric codes into labelled factors.
#
# Args:
#   data: data frame with columns R0000100, R0536300, R1482600, T6651300,
#         Z9065500 and Z9065700 (the reference-number names).
# Returns:
#   `data` with those six columns replaced by factors. R0000100, Z9065500 and
#   Z9065700 are binned with cut(); the other three are mapped code by code.
#
# If there are values not categorized they will be represented as NA
vallabels <- function(data) {
  # Bins are closed on the left (right = FALSE). include.lowest = TRUE also
  # closes the final bin on the right, so a value equal to the top break
  # (9999 for the ID, 99999999 for hours) lands in the last labelled bin
  # instead of silently becoming NA.
  id_breaks <- c(0, 1, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000,
                 9999)
  id_labels <- c("0", "1 TO 999", "1000 TO 1999", "2000 TO 2999",
                 "3000 TO 3999", "4000 TO 4999", "5000 TO 5999",
                 "6000 TO 6999", "7000 TO 7999", "8000 TO 8999",
                 "9000 TO 9999")

  # The two hours columns share one set of bins.
  hours_breaks <- c(0, 1, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500,
                    5000, 9.9999999E7)
  hours_labels <- c("0", "1 TO 499", "500 TO 999", "1000 TO 1499",
                    "1500 TO 1999", "2000 TO 2499", "2500 TO 2999",
                    "3000 TO 3499", "3500 TO 3999", "4000 TO 4499",
                    "4500 TO 4999", "5000 TO 99999999: 5000+")

  data$R0000100 <- cut(data$R0000100, id_breaks, labels = id_labels,
                       right = FALSE, include.lowest = TRUE)
  data$R0536300 <- factor(data$R0536300, levels = c(1, 2, 0),
                          labels = c("Male", "Female", "No Information"))
  data$R1482600 <- factor(data$R1482600, levels = c(1, 2, 3, 4),
                          labels = c("Black", "Hispanic",
                                     "Mixed Race (Non-Hispanic)",
                                     "Non-Black / Non-Hispanic"))
  data$T6651300 <- factor(data$T6651300, levels = c(26, 27, 28, 29, 30, 31, 32),
                          labels = c("26", "27", "28", "29", "30", "31", "32"))
  data$Z9065500 <- cut(data$Z9065500, hours_breaks, labels = hours_labels,
                       right = FALSE, include.lowest = TRUE)
  data$Z9065700 <- cut(data$Z9065700, hours_breaks, labels = hours_labels,
                       right = FALSE, include.lowest = TRUE)
  data
}
# Descriptive labels for the six variables (presumably listed in column order;
# the vector is not used elsewhere in this part of the file).
varlabels <- c(
  "PUBID - YTH ID CODE 1997",
  "KEY!SEX (SYMBOL) 1997",
  "KEY!RACE_ETHNICITY (SYMBOL) 1997",
  "CV_AGE_INT_DATE 2011",
  "CVC_HOURS_WK_TEEN",
  "CVC_HOURS_WK_ADULT"
)
# Use qnames rather than rnums
#
# Args:
#   data: object whose names are to be replaced (six columns expected).
# Returns:
#   `data` with its names set to the six question names below.
qnames <- function(data) {
  question_names <- c(
    "PUBID_1997", "Gender_1997", "Race_1997",
    "Age_2011", "Hours_teen", "Hours_adult"
  )
  names(data) <- question_names
  data
}
********************************************************************
# Build "categories": the data with value labels applied, then renamed to
# question names. vallabels() looks columns up by reference number, so it must
# run before new_data itself is renamed.
categories <- qnames(vallabels(new_data))
# Rename the raw data from reference numbers to question names.
new_data <- qnames(new_data)
# Produce summaries for the raw (uncategorized) data file
summary(new_data)
## PUBID_1997 Gender_1997 Race_1997 Age_2011
## Min. : 1 Min. :1.000 Min. :1.000 Min. :26.00
## 1st Qu.:2249 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:28.00
## Median :4502 Median :1.000 Median :4.000 Median :29.00
## Mean :4504 Mean :1.488 Mean :2.788 Mean :28.79
## 3rd Qu.:6758 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:30.00
## Max. :9022 Max. :2.000 Max. :4.000 Max. :32.00
## NA's :1561
## Hours_teen Hours_adult
## Min. : 0 Min. : 0
## 1st Qu.: 1255 1st Qu.: 8190
## Median : 2741 Median :16396
## Mean : 3105 Mean :15595
## 3rd Qu.: 4470 3rd Qu.:22418
## Max. :18829 Max. :63722
## NA's :707 NA's :1596
# Produce summaries for the "categories" data file.
# Keep the next line commented out: "categories" was already built above, and
# new_data no longer carries the reference-number column names that
# vallabels() looks up.
#categories <- vallabels(new_data)
summary(categories)
## PUBID_1997 Gender_1997
## 1000 TO 1999: 999 Male :4599
## 1 TO 999 : 998 Female :4385
## 4000 TO 4999: 998 No Information: 0
## 2000 TO 2999: 997
## 3000 TO 3999: 996
## 5000 TO 5999: 996
## (Other) :3000
## Race_1997 Age_2011
## Black :2335 30 :1546
## Hispanic :1901 29 :1513
## Mixed Race (Non-Hispanic): 83 28 :1507
## Non-Black / Non-Hispanic :4665 27 :1484
## 31 :1075
## (Other): 298
## NA's :1561
## Hours_teen Hours_adult
## 5000 TO 99999999: 5000+:1614 5000 TO 99999999: 5000+:6063
## 1000 TO 1499 : 742 0 : 500
## 1500 TO 1999 : 709 1 TO 499 : 117
## 2000 TO 2499 : 667 1000 TO 1499 : 92
## 500 TO 999 : 666 4500 TO 4999 : 91
## (Other) :3879 (Other) : 525
## NA's : 707 NA's :1596
********************************************************************
Q1. Use the dplyr command tbl_df to create a table data frame. Then, show a glimpse of the data.
# Attach dplyr. library() stops with an error if the package is missing,
# whereas require() only warns and returns FALSE, deferring the failure to the
# first dplyr call.
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Wrap the renamed raw data in a dplyr table data frame, then print a compact
# per-column overview of it.
# NOTE(review): tbl_df() is deprecated in newer dplyr releases in favour of
# as_tibble(); kept here because Q1 asks for tbl_df — confirm before upgrading.
cum_hours_work <- tbl_df(new_data)
glimpse(cum_hours_work)
## Observations: 8,984
## Variables: 6
## $ PUBID_1997 (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Gender_1997 (int) 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2,...
## $ Race_1997 (int) 4, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2,...
## $ Age_2011 (int) 29, 29, 28, 30, 29, 29, 28, 30, 29, NA, 29, 29, 26...
## $ Hours_teen (int) 5831, NA, 6489, 3292, 680, NA, 1650, 2082, 864, 0,...
## $ Hours_adult (int) NA, 29712, NA, 23390, 28056, 18379, NA, 18419, 192...
# Summarise each column of the table data frame built from new_data.
summary(cum_hours_work)
## PUBID_1997 Gender_1997 Race_1997 Age_2011
## Min. : 1 Min. :1.000 Min. :1.000 Min. :26.00
## 1st Qu.:2249 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:28.00
## Median :4502 Median :1.000 Median :4.000 Median :29.00
## Mean :4504 Mean :1.488 Mean :2.788 Mean :28.79
## 3rd Qu.:6758 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:30.00
## Max. :9022 Max. :2.000 Max. :4.000 Max. :32.00
## NA's :1561
## Hours_teen Hours_adult
## Min. : 0 Min. : 0
## 1st Qu.: 1255 1st Qu.: 8190
## Median : 2741 Median :16396
## Mean : 3105 Mean :15595
## 3rd Qu.: 4470 3rd Qu.:22418
## Max. :18829 Max. :63722
## NA's :707 NA's :1596