Work Experience by Sex and Race Among 30 Year-Olds

# Set working directory
# setwd()

# Read the space-delimited extract and name its columns by reference number.
new_data <- setNames(
  read.table("wfed540assignment2.dat", sep = " "),
  c("R0000100", "R0536300", "R1482600", "T6651300", "Z9065500", "Z9065700")
)

# Handle missing values: recode each negative missing-data code to NA.
#   -1 Refused, -2 Dont know, -3 Invalid missing, -4 Valid missing,
#   -5 Non-interview
for (missing_code in c(-1, -2, -3, -4, -5)) {
  new_data[new_data == missing_code] <- NA
}

# Apply value labels to the raw extract, turning every coded column into a
# factor.
#
# Args:
#   data: data frame holding the reference-number columns R0000100, R0536300,
#     R1482600, T6651300, Z9065500 and Z9065700 as numeric codes.
#
# Returns:
#   `data` with each of those six columns replaced by a factor. Values that
#   match no listed code or range (including NA) are represented as NA.
vallabels <- function(data) {
  # Bin into left-closed ranges [lower, upper). include.lowest = TRUE also
  # closes the final range on the right, so a value equal to the top break
  # (e.g. 9999 in "9000 TO 9999") gets its label instead of becoming NA.
  bin <- function(x, breaks, labels) {
    cut(x, breaks, labels = labels, right = FALSE, include.lowest = TRUE)
  }

  id_breaks <- c(0, 1, seq(1000, 9000, by = 1000), 9999)
  id_labels <- c(
    "0", "1 TO 999", "1000 TO 1999", "2000 TO 2999", "3000 TO 3999",
    "4000 TO 4999", "5000 TO 5999", "6000 TO 6999", "7000 TO 7999",
    "8000 TO 8999", "9000 TO 9999"
  )

  # Both cumulative-hours columns share the same 500-hour bins.
  hours_breaks <- c(0, 1, seq(500, 5000, by = 500), 9.9999999E7)
  hours_labels <- c(
    "0", "1 TO 499", "500 TO 999", "1000 TO 1499", "1500 TO 1999",
    "2000 TO 2499", "2500 TO 2999", "3000 TO 3499", "3500 TO 3999",
    "4000 TO 4499", "4500 TO 4999", "5000 TO 99999999: 5000+"
  )

  data$R0000100 <- bin(data$R0000100, id_breaks, id_labels)
  data$R0536300 <- factor(
    data$R0536300,
    levels = c(1, 2, 0),
    labels = c("Male", "Female", "No Information")
  )
  data$R1482600 <- factor(
    data$R1482600,
    levels = c(1, 2, 3, 4),
    labels = c(
      "Black", "Hispanic", "Mixed Race (Non-Hispanic)",
      "Non-Black / Non-Hispanic"
    )
  )
  data$T6651300 <- factor(
    data$T6651300,
    levels = 26:32,
    labels = c("26", "27", "28", "29", "30", "31", "32")
  )
  data$Z9065500 <- bin(data$Z9065500, hours_breaks, hours_labels)
  data$Z9065700 <- bin(data$Z9065700, hours_breaks, hours_labels)
  data
}

# Descriptive labels for the six extract columns, in column order.
varlabels <- c(
  "PUBID - YTH ID CODE 1997",
  "KEY!SEX (SYMBOL) 1997",
  "KEY!RACE_ETHNICITY (SYMBOL) 1997",
  "CV_AGE_INT_DATE 2011",
  "CVC_HOURS_WK_TEEN",
  "CVC_HOURS_WK_ADULT"
)
# Rename the six columns from reference numbers to question names.
#
# Args:
#   data: data frame whose six columns are in extract order.
#
# Returns:
#   `data` with its column names replaced, position by position.
qnames <- function(data) {
  question_names <- c(
    "PUBID_1997", "Gender_1997", "Race_1997",
    "Age_2011", "Hours_teen", "Hours_adult"
  )
  stats::setNames(data, question_names)
}

********************************************************************

# Build "categories": the raw data with value labels applied (while the
# columns still carry their reference numbers), then renamed to question names.
categories <- qnames(vallabels(new_data))
# Rename the raw data's columns to question names as well.
new_data <- qnames(new_data)
# Produce summaries for the raw (uncategorized) data file
summary(new_data)
##    PUBID_1997    Gender_1997      Race_1997        Age_2011    
##  Min.   :   1   Min.   :1.000   Min.   :1.000   Min.   :26.00  
##  1st Qu.:2249   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:28.00  
##  Median :4502   Median :1.000   Median :4.000   Median :29.00  
##  Mean   :4504   Mean   :1.488   Mean   :2.788   Mean   :28.79  
##  3rd Qu.:6758   3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:30.00  
##  Max.   :9022   Max.   :2.000   Max.   :4.000   Max.   :32.00  
##                                                 NA's   :1561   
##    Hours_teen     Hours_adult   
##  Min.   :    0   Min.   :    0  
##  1st Qu.: 1255   1st Qu.: 8190  
##  Median : 2741   Median :16396  
##  Mean   : 3105   Mean   :15595  
##  3rd Qu.: 4470   3rd Qu.:22418  
##  Max.   :18829   Max.   :63722  
##  NA's   :707     NA's   :1596
# Produce summaries for the value-labelled "categories" data file.
# The next line is left disabled: "categories" was already created earlier in
# this script, before the columns were renamed.
#categories <- vallabels(new_data)
summary(categories)
##         PUBID_1997           Gender_1997  
##  1000 TO 1999: 999   Male          :4599  
##  1 TO 999    : 998   Female        :4385  
##  4000 TO 4999: 998   No Information:   0  
##  2000 TO 2999: 997                        
##  3000 TO 3999: 996                        
##  5000 TO 5999: 996                        
##  (Other)     :3000                        
##                      Race_1997       Age_2011   
##  Black                    :2335   30     :1546  
##  Hispanic                 :1901   29     :1513  
##  Mixed Race (Non-Hispanic):  83   28     :1507  
##  Non-Black / Non-Hispanic :4665   27     :1484  
##                                   31     :1075  
##                                   (Other): 298  
##                                   NA's   :1561  
##                    Hours_teen                    Hours_adult  
##  5000 TO 99999999: 5000+:1614   5000 TO 99999999: 5000+:6063  
##  1000 TO 1499           : 742   0                      : 500  
##  1500 TO 1999           : 709   1 TO 499               : 117  
##  2000 TO 2499           : 667   1000 TO 1499           :  92  
##  500 TO 999             : 666   4500 TO 4999           :  91  
##  (Other)                :3879   (Other)                : 525  
##  NA's                   : 707   NA's                   :1596

********************************************************************

Q1. Use the dplyr command tbl_df to create a table data frame. Then, show a glimpse of the data.

require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Wrap the renamed raw data in a dplyr table data frame, as Q1 asks.
# NOTE(review): tbl_df() is reported as deprecated in newer dplyr releases in
# favour of as_tibble() -- confirm before re-running on a current install.
cum_hours_work <- tbl_df(new_data)
# Show each column's type and its first few values.
glimpse(cum_hours_work)
## Observations: 8,984
## Variables: 6
## $ PUBID_1997  (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Gender_1997 (int) 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2,...
## $ Race_1997   (int) 4, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2,...
## $ Age_2011    (int) 29, 29, 28, 30, 29, 29, 28, 30, 29, NA, 29, 29, 26...
## $ Hours_teen  (int) 5831, NA, 6489, 3292, 680, NA, 1650, 2082, 864, 0,...
## $ Hours_adult (int) NA, 29712, NA, 23390, 28056, 18379, NA, 18419, 192...
# Per-column numeric summary, including NA counts.
summary(cum_hours_work)
##    PUBID_1997    Gender_1997      Race_1997        Age_2011    
##  Min.   :   1   Min.   :1.000   Min.   :1.000   Min.   :26.00  
##  1st Qu.:2249   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:28.00  
##  Median :4502   Median :1.000   Median :4.000   Median :29.00  
##  Mean   :4504   Mean   :1.488   Mean   :2.788   Mean   :28.79  
##  3rd Qu.:6758   3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:30.00  
##  Max.   :9022   Max.   :2.000   Max.   :4.000   Max.   :32.00  
##                                                 NA's   :1561   
##    Hours_teen     Hours_adult   
##  Min.   :    0   Min.   :    0  
##  1st Qu.: 1255   1st Qu.: 8190  
##  Median : 2741   Median :16396  
##  Mean   : 3105   Mean   :15595  
##  3rd Qu.: 4470   3rd Qu.:22418  
##  Max.   :18829   Max.   :63722  
##  NA's   :707     NA's   :1596

********************************************************************

Q2. Filter the data to include only people who were 30 years of age at the time of the 2011 interview. Calculate the number of people included.

# Keep only respondents whose age at the 2011 interview is 30, then count the
# remaining rows in a single column named Thirty. Rows with a missing
# Age_2011 are dropped by filter(). The result is printed, not stored.
cum_hours_work %>%
  filter(Age_2011 == 30) %>%
  summarize(Thirty=n())
## Source: local data frame [1 x 1]
## 
##   Thirty
##    (int)
## 1   1546

********************************************************************

Q3. Test the null hypothesis that there is no difference by sex between the mean cumulative hours work from age 14 through age 19.

# Restrict the working table to respondents aged 30 at the 2011 interview.
# The filtered table overwrites cum_hours_work, so every later statement in
# this script that reads cum_hours_work sees 30-year-olds only.
cum_hours_work <- cum_hours_work %>%
  filter(Age_2011 == 30)
# Two-sample t-test of teen cumulative hours (Hours_teen) by sex code
# (Gender_1997: 1 = Male, 2 = Female, per the labels defined in vallabels).
t.test(cum_hours_work$Hours_teen~cum_hours_work$Gender_1997)
## 
##  Welch Two Sample t-test
## 
## data:  cum_hours_work$Hours_teen by cum_hours_work$Gender_1997
## t = 4.423, df = 1339.6, p-value = 1.052e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  306.4014 794.8262
## sample estimates:
## mean in group 1 mean in group 2 
##        3528.906        2978.293
The p-value (1.052e-05) is less than α = .05, so we reject the null hypothesis (H0).
There is a statistically significant difference by sex in mean cumulative hours worked from age 14 through age 19: males (group 1) averaged 3528.9 hours versus 2978.3 hours for females (group 2), t = 4.423, df = 1339.6, p-value = 1.052e-05, 95% CI [306.4014, 794.8262].

*********************************************************************

Q4. Test the null hypothesis that there is no difference by sex between the mean cumulative hours work from age 20 and older.

# Two-sample t-test of adult cumulative hours (Hours_adult) by sex code
# (Gender_1997: 1 = Male, 2 = Female), on the table already filtered to
# 30-year-olds.
t.test(cum_hours_work$Hours_adult~cum_hours_work$Gender_1997)
## 
##  Welch Two Sample t-test
## 
## data:  cum_hours_work$Hours_adult by cum_hours_work$Gender_1997
## t = 4.6599, df = 1146.6, p-value = 3.535e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1400.975 3438.741
## sample estimates:
## mean in group 1 mean in group 2 
##        20671.98        18252.12
The p-value (3.535e-06) is less than α = .05, so we reject the null hypothesis (H0).
There is a statistically significant difference by sex in mean cumulative hours worked from age 20 and older: males (group 1) averaged 20671.98 hours versus 18252.12 hours for females (group 2), t = 4.6599, df = 1146.6, p-value = 3.535e-06, 95% CI [1400.975, 3438.741].

*********************************************************************

Q5. Test the null hypothesis that there is no difference by race/ethnicity between the mean cumulative hours work from age 14 through age 19.

require(magrittr)
## Loading required package: magrittr
# Binary race/ethnicity indicator for the two-group t-tests:
#   1 when Race_1997 is code 4 (labelled "Non-Black / Non-Hispanic"),
#   0 for any other race code, NA where Race_1997 is missing.
# A single vectorized comparison replaces the initialise-to-0-then-ifelse pair.
cum_hours_work$race_recode <- as.numeric(cum_hours_work$Race_1997 == 4)
#cum_hours_work$race_recode

# Two-sample t-test of teen cumulative hours (Hours_teen) by the binary
# race_recode indicator (1 = Race_1997 code 4, 0 = all other codes).
t.test(cum_hours_work$Hours_teen~cum_hours_work$race_recode)
## 
##  Welch Two Sample t-test
## 
## data:  cum_hours_work$Hours_teen by cum_hours_work$race_recode
## t = -6.3593, df = 1412.8, p-value = 2.73e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1027.4485  -543.0118
## sample estimates:
## mean in group 0 mean in group 1 
##        2860.372        3645.603
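The p-value (2.73e-10) is less than α = .05, so we reject the null hypothesis (H0).
There is a statistically significant difference by race/ethnicity in mean cumulative hours worked from age 14 through age 19: Non-Black / Non-Hispanic respondents (group 1) averaged 3645.6 hours versus 2860.4 hours for all other respondents (group 0), t = -6.3593, df = 1412.8, p-value = 2.73e-10, 95% CI [-1027.4485, -543.0118].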

**********************************************************************

Q6. Test the null hypothesis that there is no difference by race/ethnicity between the mean cumulative hours work from age 20 and older.

# Two-sample t-test of adult cumulative hours (Hours_adult) by the binary
# race_recode indicator (1 = Race_1997 code 4, 0 = all other codes).
t.test(cum_hours_work$Hours_adult~cum_hours_work$race_recode)
## 
##  Welch Two Sample t-test
## 
## data:  cum_hours_work$Hours_adult by cum_hours_work$race_recode
## t = -4.2344, df = 1199.7, p-value = 2.466e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3216.827 -1179.760
## sample estimates:
## mean in group 0 mean in group 1 
##        18321.36        20519.65
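The p-value (2.466e-05) is less than α = .05, so we reject the null hypothesis (H0).
There is a statistically significant difference by race/ethnicity in mean cumulative hours worked from age 20 and older: Non-Black / Non-Hispanic respondents (group 1) averaged 20519.65 hours versus 18321.36 hours for all other respondents (group 0), t = -4.2344, df = 1199.7, p-value = 2.466e-05, 95% CI [-3216.827, -1179.760].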