#Title: WFED 540 Assignment #2
#Author: Andrew Leigey
#Date: October 16, 2015
#Format: html_document
# Set working directory
# setwd()
getwd()
## [1] "/Users/aleigey/Desktop/WFED540fall15/Assignment2"
# Read the space-delimited extract and name its six columns by reference number.
new_data <- read.table(
  'Assgn2.dat',
  sep = ' '
)
names(new_data) <- c(
  'R0000100', 'R0536300', 'R1482600',
  'T6651300', 'Z9065500', 'Z9065700'
)
# Recode each negative non-response code to NA, in every column.
new_data[new_data == -1] <- NA  # Refused
new_data[new_data == -2] <- NA  # Don't know
new_data[new_data == -3] <- NA  # Invalid missing
new_data[new_data == -4] <- NA  # Valid missing
new_data[new_data == -5] <- NA  # Non-interview
# If there are values not categorized they will be represented as NA
# Replace the raw numeric codes in the six reference-number columns with
# labelled factors.
#
# Args:
#   data: data frame containing the columns R0000100, R0536300, R1482600,
#         T6651300, Z9065500 and Z9065700 as numeric codes.
#
# Returns:
#   `data` with those six columns converted to factors. Range-type columns
#   are binned with cut() into left-closed intervals; any value that matches
#   no bin or level becomes NA.
vallabels <- function(data) {
  # The two cumulative-hours columns use the same bins and labels.
  hours_breaks <- c(
    0.0, 1.0, 500.0, 1000.0, 1500.0, 2000.0, 2500.0, 3000.0,
    3500.0, 4000.0, 4500.0, 5000.0, 9.9999999E7
  )
  hours_labels <- c(
    "0", "1 TO 499", "500 TO 999", "1000 TO 1499", "1500 TO 1999",
    "2000 TO 2499", "2500 TO 2999", "3000 TO 3499", "3500 TO 3999",
    "4000 TO 4499", "4500 TO 4999", "5000 TO 99999999: 5000+"
  )
  id_breaks <- c(
    0.0, 1.0, 1000.0, 2000.0, 3000.0, 4000.0, 5000.0,
    6000.0, 7000.0, 8000.0, 9000.0, 9999.0
  )
  id_labels <- c(
    "0", "1 TO 999", "1000 TO 1999", "2000 TO 2999", "3000 TO 3999",
    "4000 TO 4999", "5000 TO 5999", "6000 TO 6999", "7000 TO 7999",
    "8000 TO 8999", "9000 TO 9999"
  )

  # With right = FALSE the last interval would be open on the right, so the
  # top value named in the final label (9999 or 99999999) would become NA.
  # include.lowest = TRUE closes that last interval so the value is binned.
  data$R0000100 <- cut(
    data$R0000100, id_breaks,
    labels = id_labels, right = FALSE, include.lowest = TRUE
  )
  data$R0536300 <- factor(
    data$R0536300,
    levels = c(1.0, 2.0, 0.0),
    labels = c("Male", "Female", "No Information")
  )
  data$R1482600 <- factor(
    data$R1482600,
    levels = c(1.0, 2.0, 3.0, 4.0),
    labels = c(
      "Black", "Hispanic", "Mixed Race (Non-Hispanic)",
      "Non-Black / Non-Hispanic"
    )
  )
  data$T6651300 <- factor(
    data$T6651300,
    levels = c(26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0),
    labels = c("26", "27", "28", "29", "30", "31", "32")
  )
  data$Z9065500 <- cut(
    data$Z9065500, hours_breaks,
    labels = hours_labels, right = FALSE, include.lowest = TRUE
  )
  data$Z9065700 <- cut(
    data$Z9065700, hours_breaks,
    labels = hours_labels, right = FALSE, include.lowest = TRUE
  )
  data
}
# Descriptive labels, one entry per variable in the extract.
varlabels <- c(
  "PUBID - YTH ID CODE 1997",
  "KEY!SEX (SYMBOL) 1997",
  "KEY!RACE_ETHNICITY (SYMBOL) 1997",
  "CV_AGE_INT_DATE 2011",
  "CVC_HOURS_WK_TEEN",
  "CVC_HOURS_WK_ADULT"
)
# Use qnames rather than rnums
# Replace a data frame's column names with short question names.
#
# Args:
#   data: data frame whose columns are renamed by position.
#
# Returns:
#   The data frame with names P_1997, sex, race, age, hrswrkd14_19 and
#   hrswrkd20; the column contents are untouched.
qnames <- function(data) {
  question_names <- c(
    "P_1997", "sex", "race", "age", "hrswrkd14_19", "hrswrkd20"
  )
  names(data) <- question_names
  data
}
#********************************************************************************************************
# Build "categories", a copy of the data with value labels applied, and give
# both data frames question names in place of reference numbers.
categories <- qnames(vallabels(new_data))
new_data <- qnames(new_data)
# Produce summaries for the raw (uncategorized) data file
summary(new_data)
## P_1997 sex race age
## Min. : 1 Min. :1.000 Min. :1.000 Min. :26.00
## 1st Qu.:2249 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:28.00
## Median :4502 Median :1.000 Median :4.000 Median :29.00
## Mean :4504 Mean :1.488 Mean :2.788 Mean :28.79
## 3rd Qu.:6758 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:30.00
## Max. :9022 Max. :2.000 Max. :4.000 Max. :32.00
## NA's :1561
## hrswrkd14_19 hrswrkd20
## Min. : 0 Min. : 0
## 1st Qu.: 1255 1st Qu.: 8190
## Median : 2741 Median :16396
## Mean : 3105 Mean :15595
## 3rd Qu.: 4470 3rd Qu.:22418
## Max. :18829 Max. :63722
## NA's :707 NA's :1596
# Remove the '#' before the following lines to produce summaries for the "categories" data file.
#categories <- vallabels(new_data)
#summary(categories)
#************************************************************************************************************
# Same summary call as earlier in the script; new_data has not been modified
# in between, so the printed output is identical.
summary(new_data)
## P_1997 sex race age
## Min. : 1 Min. :1.000 Min. :1.000 Min. :26.00
## 1st Qu.:2249 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:28.00
## Median :4502 Median :1.000 Median :4.000 Median :29.00
## Mean :4504 Mean :1.488 Mean :2.788 Mean :28.79
## 3rd Qu.:6758 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:30.00
## Max. :9022 Max. :2.000 Max. :4.000 Max. :32.00
## NA's :1561
## hrswrkd14_19 hrswrkd20
## Min. : 0 Min. : 0
## 1st Qu.: 1255 1st Qu.: 8190
## Median : 2741 Median :16396
## Mean : 3105 Mean :15595
## 3rd Qu.: 4470 3rd Qu.:22418
## Max. :18829 Max. :63722
## NA's :707 NA's :1596
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##1.
###Download the following variables and associated RScript necessary
###from the NLSY97 data set accessible through https://www.nlsinfo.org/investigator/pages/login.jsp -
###• R05363.00 [KEY!SEX], KEY!SEX, R'S GENDER
###• R14826.00 [KEY!RACE_ETHNICITY], KEY!RACE_ETHNICITY, COMBINED RACE AND ETHNICITY (SYMBOL)
###• T66513.00 [CV_AGE_INT_DATE], R'S AGE AT INTERVIEW DATE (at the time of the 2011 interview)
###• Z90655.00 [CVC_HOURS_WK_TEEN], CUMULATIVE HOURS WORKED FROM AGE 14 THROUGH AGE 19
###• Z90657.00 [CVC_HOURS_WK_ADULT_ALL], CUMULATIVE HOURS R WORKED FROM AGE 20
###Create a project in R with these data. Use the dplyr command tbl_df to create a table data frame.
###Then, show a glimpse of the data. The inclusion of the correct variables in this glimpse is the
###focal point for assessment of successful completion of this item in the assignment.
tbl_df(new_data)
## Source: local data frame [8,984 x 6]
##
## P_1997 sex race age hrswrkd14_19 hrswrkd20
## (int) (int) (int) (int) (int) (int)
## 1 1 2 4 29 5831 NA
## 2 2 1 2 29 NA 29712
## 3 3 2 2 28 6489 NA
## 4 4 2 2 30 3292 23390
## 5 5 1 2 29 680 28056
## 6 6 2 2 29 NA 18379
## 7 7 1 2 28 1650 NA
## 8 8 2 4 30 2082 18419
## 9 9 1 4 29 864 19274
## 10 10 1 4 NA 0 6849
## .. ... ... ... ... ... ...
glimpse(new_data)
## Observations: 8,984
## Variables: 6
## $ P_1997 (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...
## $ sex (int) 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2...
## $ race (int) 4, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2...
## $ age (int) 29, 29, 28, 30, 29, 29, 28, 30, 29, NA, 29, 29, 2...
## $ hrswrkd14_19 (int) 5831, NA, 6489, 3292, 680, NA, 1650, 2082, 864, 0...
## $ hrswrkd20 (int) NA, 29712, NA, 23390, 28056, 18379, NA, 18419, 19...
##2.
###Filter the data in your project to include only people who were 30 years of age at the time of
###the 2011 interview. Calculate the number of people included. Perform the remainder of the
###calculations in this assignment using data from these included people 30 years of age in 2011.
# Keep respondents whose age at the 2011 interview lies strictly between 29
# and 31, then display the subset as a table data frame.
new_data30 <- filter(new_data, age > 29 & age < 31)
tbl_df(new_data30)
## Source: local data frame [1,546 x 6]
##
## P_1997 sex race age hrswrkd14_19 hrswrkd20
## (int) (int) (int) (int) (int) (int)
## 1 4 2 2 30 3292 23390
## 2 8 2 4 30 2082 18419
## 3 26 1 1 30 760 0
## 4 27 1 1 30 1592 11060
## 5 32 2 4 30 4611 16981
## 6 33 2 4 30 1862 26551
## 7 38 2 4 30 0 NA
## 8 55 2 2 30 1368 16934
## 9 59 2 1 30 5648 NA
## 10 68 1 1 30 1316 NA
## .. ... ... ... ... ... ...
# Count the rows kept by the age filter; the outer parentheses print the
# one-row result as it is assigned.
(new_data30count <- summarise(new_data30, count = n()))
## count
## 1 1546
##3.
###Test the null hypothesis that there is no difference by sex between the
###mean cumulative hours work from age 14 through age 19 with a probability
###of Type 1 error of 0.05 level.
require(magrittr)
## Loading required package: magrittr
require(ggvis)
## Loading required package: ggvis
# Teen hours (ages 14-19): histogram over all non-negative values.
new_data30 %>%
  filter(hrswrkd14_19 >= 0) %>%
  ggvis(~hrswrkd14_19) %>%
  layer_histograms()
## Guessing width = 500 # range / 29
# Teen hours again, restricted to the 500-8000 range.
new_data30 %>%
  filter(hrswrkd14_19 >= 500, hrswrkd14_19 <= 8000) %>%
  ggvis(~hrswrkd14_19) %>%
  layer_histograms()
## Guessing width = 200 # range / 38
# Adult hours (age 20 onward): histogram over all non-negative values.
new_data30 %>%
  filter(hrswrkd20 >= 0) %>%
  ggvis(~hrswrkd20) %>%
  layer_histograms()
## Guessing width = 2000 # range / 32
# Adult hours again, restricted to the 1000-30000 range.
new_data30 %>%
  filter(hrswrkd20 >= 1000, hrswrkd20 <= 30000) %>%
  ggvis(~hrswrkd20) %>%
  layer_histograms()
## Guessing width = 1000 # range / 29
# Analysis sample: rows inside both ranges above. filter() also drops rows
# where either hours column is NA.
new_data30fltr <- filter(
  new_data30,
  hrswrkd14_19 >= 500, hrswrkd14_19 <= 8000,
  hrswrkd20 >= 1000, hrswrkd20 <= 30000
)
tbl_df(new_data30fltr)
## Source: local data frame [888 x 6]
##
## P_1997 sex race age hrswrkd14_19 hrswrkd20
## (int) (int) (int) (int) (int) (int)
## 1 4 2 2 30 3292 23390
## 2 8 2 4 30 2082 18419
## 3 27 1 1 30 1592 11060
## 4 32 2 4 30 4611 16981
## 5 33 2 4 30 1862 26551
## 6 55 2 2 30 1368 16934
## 7 80 1 1 30 3120 1891
## 8 83 1 2 30 1910 14857
## 9 102 2 2 30 3488 17674
## 10 104 2 4 30 4468 28206
## .. ... ... ... ... ... ...
# Histogram of the numeric sex code in the analysis sample.
new_data30fltr %>%
  ggvis(~sex) %>%
  layer_histograms()
## Guessing width = 0.05 # range / 20
# Histogram of teen hours in the analysis sample.
new_data30fltr %>%
  ggvis(~hrswrkd14_19) %>%
  layer_histograms()
## Guessing width = 200 # range / 38
# Two-sample t-test of teen hours by sex, assuming equal variances.
t.test(hrswrkd14_19 ~ sex, data = new_data30fltr, var.equal = TRUE)
##
## Two Sample t-test
##
## data: hrswrkd14_19 by sex
## t = 0.083046, df = 886, p-value = 0.9338
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -221.9063 241.5152
## sample estimates:
## mean in group 1 mean in group 2
## 3244.928 3235.123
#Here I failed to reject the null hypothesis that there is
#no difference by sex between the mean cumulative hours worked from
#age 14 through age 19 because the p-value was greater than the .05 level.
#t(hrswrkd14_19 by sex)=0.083046, p=0.9338, df=886, 95% CI [-221.9063, 241.5152]
##4.
###Test the null hypothesis that there is no difference by sex between
###the mean cumulative hours work from age 20 and older with a probability of
###Type 1 error of 0.05 level.
# Histogram of adult hours in the analysis sample.
new_data30fltr %>%
  ggvis(~hrswrkd20) %>%
  layer_histograms()
## Guessing width = 1000 # range / 29
# Two-sample t-test of adult hours by sex, assuming equal variances.
t.test(hrswrkd20 ~ sex, data = new_data30fltr, var.equal = TRUE)
##
## Two Sample t-test
##
## data: hrswrkd20 by sex
## t = 2.2493, df = 886, p-value = 0.02474
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 138.0059 2027.6400
## sample estimates:
## mean in group 1 mean in group 2
## 19414.71 18331.89
#Here I reject the null hypothesis that there is
#no difference by sex between the mean cumulative hours worked from
#age 20 and older because the p-value was less than the .05 level.
#t(hrswrkd20 by sex)=2.2493, p=0.02474, df=886, 95% CI [138.0059, 2027.6400]
##5.
###Test the null hypothesis that there is no difference by race/ethnicity between the mean cumulative hours
###work from age 14 through age 19 with a probability of Type 1 error of 0.05 level.
###In this analysis — and in the analysis for item (6) — code race/ethnicity
###as "1" if race/ethnicity is "Non-Black, Non-Hispanic" and "0" otherwise.
# Collapse race/ethnicity to an indicator: 1 when the code is 4
# (Non-Black / Non-Hispanic), 0 for any other code.
# NOTE: this overwrites `race`, so running the line again on the recoded
# column would turn every non-missing value into 0.
new_data30fltr[["race"]] <- ifelse(new_data30fltr[["race"]] == 4, 1, 0)
# Two-sample t-test of teen hours by the race indicator, equal variances.
t.test(hrswrkd14_19 ~ race, data = new_data30fltr, var.equal = TRUE)
##
## Two Sample t-test
##
## data: hrswrkd14_19 by race
## t = -4.5734, df = 886, p-value = 5.481e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -760.1347 -303.6296
## sample estimates:
## mean in group 0 mean in group 1
## 2961.043 3492.925
#Here I reject the null hypothesis that there is
#no difference by race between the mean cumulative hours worked from
#age 14 through age 19 because the p-value was less than the .05 level.
#t(hrswrkd14_19 by race)=-4.5734, p=5.481e-06, df=886, 95% CI [-760.1347, -303.6296]
##6.
###Test the null hypothesis that there is no difference by race/ethnicity
###between the mean cumulative hours work from age 20 and older with a
###probability of Type 1 error of 0.05 level.
# Two-sample t-test of adult hours by the race indicator, equal variances.
t.test(hrswrkd20 ~ race, data = new_data30fltr, var.equal = TRUE)
##
## Two Sample t-test
##
## data: hrswrkd20 by race
## t = -0.70509, df = 886, p-value = 0.4809
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1283.2099 604.8983
## sample estimates:
## mean in group 0 mean in group 1
## 18644.49 18983.65
#Here I failed to reject the null hypothesis that there is
#no difference by race between the mean cumulative hours worked from
#age 20 and older because the p-value was greater than the .05 level.
#t(hrswrkd20 by race)=-0.70509, p=0.4809, df=886, 95% CI [-1283.2099, 604.8983]
#####References:
####National Longitudinal Survey of Youth 1997 (NLSY1997), Retrieved
####October 8, 2015, from:https://www.nlsinfo.org/investigator/pages/login.jsp