# Set working directory
# setwd()

getwd()
## [1] "/Users/aleigey/Desktop/WFED540fall15/Assignment2"
# Read the space-delimited data file from the working directory, then name its
# columns with the survey reference numbers.
new_data <- read.table(file = "Assgn2.dat", sep = " ")
names(new_data) <- c(
  "R0000100", "R0536300", "R1482600",
  "T6651300", "Z9065500", "Z9065700"
)

# Handle missing values: replace each negative non-response code with NA,
# one code at a time.
#   -1 Refused, -2 Dont know, -3 Invalid missing, -4 Valid missing,
#   -5 Non-interview
new_data <- Reduce(
  function(frame, code) {
    frame[frame == code] <- NA
    frame
  },
  c(-1, -2, -3, -4, -5),
  new_data
)

# Convert the raw numeric codes of the six downloaded variables into labelled
# factors. Values that match none of the listed categories become NA.
#
# Args:
#   data: data frame containing the columns R0000100, R0536300, R1482600,
#         T6651300, Z9065500 and Z9065700 (reference-number names, i.e. the
#         names in use before qnames() is applied).
# Returns:
#   The same data frame with those six columns replaced by factors.
vallabels <- function(data) {
  # Range categories are closed on the left (right = FALSE). With
  # include.lowest = TRUE the last interval is also closed on the right, so the
  # top value named in the final label (9999 / 99999999) is categorized instead
  # of silently turning into NA.
  id_breaks <- c(0, 1, seq(1000, 9000, by = 1000), 9999)
  id_labels <- c(
    "0", "1 TO 999", "1000 TO 1999", "2000 TO 2999", "3000 TO 3999",
    "4000 TO 4999", "5000 TO 5999", "6000 TO 6999", "7000 TO 7999",
    "8000 TO 8999", "9000 TO 9999"
  )

  # Both cumulative-hours variables share the same categories.
  hours_breaks <- c(0, 1, seq(500, 5000, by = 500), 99999999)
  hours_labels <- c(
    "0", "1 TO 499", "500 TO 999", "1000 TO 1499", "1500 TO 1999",
    "2000 TO 2499", "2500 TO 2999", "3000 TO 3499", "3500 TO 3999",
    "4000 TO 4499", "4500 TO 4999", "5000 TO 99999999: 5000+"
  )

  data$R0000100 <- cut(
    data$R0000100, id_breaks,
    labels = id_labels, right = FALSE, include.lowest = TRUE
  )
  data$R0536300 <- factor(
    data$R0536300,
    levels = c(1, 2, 0),
    labels = c("Male", "Female", "No Information")
  )
  data$R1482600 <- factor(
    data$R1482600,
    levels = c(1, 2, 3, 4),
    labels = c(
      "Black", "Hispanic", "Mixed Race (Non-Hispanic)",
      "Non-Black / Non-Hispanic"
    )
  )
  data$T6651300 <- factor(
    data$T6651300,
    levels = 26:32,
    labels = c("26", "27", "28", "29", "30", "31", "32")
  )
  data$Z9065500 <- cut(
    data$Z9065500, hours_breaks,
    labels = hours_labels, right = FALSE, include.lowest = TRUE
  )
  data$Z9065700 <- cut(
    data$Z9065700, hours_breaks,
    labels = hours_labels, right = FALSE, include.lowest = TRUE
  )
  data
}

# Descriptive titles of the six downloaded variables.
varlabels <- c(
  "PUBID - YTH ID CODE 1997",
  "KEY!SEX (SYMBOL) 1997",
  "KEY!RACE_ETHNICITY (SYMBOL) 1997",
  "CV_AGE_INT_DATE 2011",
  "CVC_HOURS_WK_TEEN",
  "CVC_HOURS_WK_ADULT"
)

# Replace the reference-number column names with readable question names.
#
# Args:
#   data: data frame whose columns are renamed positionally.
# Returns:
#   The data frame with its names set to the six question names below.
qnames <- function(data) {
  question_names <- c(
    "P_1997", "sex", "race", "age", "hrswrkd14_19", "hrswrkd20"
  )
  names(data) <- question_names
  data
}



#********************************************************************************************************

# Build "categories": a copy of the data with value labels applied and the
# columns renamed from reference numbers to question names. The labels must be
# applied first because vallabels() looks columns up by reference number.
categories <- qnames(vallabels(new_data))

# Rename the raw data's columns to question names as well.
new_data <- qnames(new_data)

# Produce summaries for the raw (uncategorized) data file
summary(new_data)
##      P_1997          sex             race            age       
##  Min.   :   1   Min.   :1.000   Min.   :1.000   Min.   :26.00  
##  1st Qu.:2249   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:28.00  
##  Median :4502   Median :1.000   Median :4.000   Median :29.00  
##  Mean   :4504   Mean   :1.488   Mean   :2.788   Mean   :28.79  
##  3rd Qu.:6758   3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:30.00  
##  Max.   :9022   Max.   :2.000   Max.   :4.000   Max.   :32.00  
##                                                 NA's   :1561   
##   hrswrkd14_19     hrswrkd20    
##  Min.   :    0   Min.   :    0  
##  1st Qu.: 1255   1st Qu.: 8190  
##  Median : 2741   Median :16396  
##  Mean   : 3105   Mean   :15595  
##  3rd Qu.: 4470   3rd Qu.:22418  
##  Max.   :18829   Max.   :63722  
##  NA's   :707     NA's   :1596
# Remove the '#' before the following lines to produce summaries for the "categories" data file.
#categories <- vallabels(new_data)
#summary(categories)

#************************************************************************************************************

# Summary of the renamed raw data (same call as the summary earlier in the file).
summary(new_data)
##      P_1997          sex             race            age       
##  Min.   :   1   Min.   :1.000   Min.   :1.000   Min.   :26.00  
##  1st Qu.:2249   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:28.00  
##  Median :4502   Median :1.000   Median :4.000   Median :29.00  
##  Mean   :4504   Mean   :1.488   Mean   :2.788   Mean   :28.79  
##  3rd Qu.:6758   3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:30.00  
##  Max.   :9022   Max.   :2.000   Max.   :4.000   Max.   :32.00  
##                                                 NA's   :1561   
##   hrswrkd14_19     hrswrkd20    
##  Min.   :    0   Min.   :    0  
##  1st Qu.: 1255   1st Qu.: 8190  
##  Median : 2741   Median :16396  
##  Mean   : 3105   Mean   :15595  
##  3rd Qu.: 4470   3rd Qu.:22418  
##  Max.   :18829   Max.   :63722  
##  NA's   :707     NA's   :1596
# library() stops with an error when dplyr is not installed; require() would
# only warn and return FALSE, postponing the failure to the first dplyr call.
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
##1.
###Download the following variables and associated RScript necessary 
###from the NLSY97 data set accessible through https://www.nlsinfo.org/investigator/pages/login.jsp -
###• R05363.00 [KEY!SEX], KEY!SEX, R'S GENDER
###• R14826.00 [KEY!RACE_ETHNICITY], KEY!RACE_ETHNICITY, COMBINED RACE AND ETHNICITY (SYMBOL)
###• T66513.00 [CV_AGE_INT_DATE], R'S AGE AT INTERVIEW DATE (at the time of the 2011 interview)
###• Z90655.00 [CVC_HOURS_WK_TEEN], CUMULATIVE HOURS WORKED FROM AGE 14 THROUGH AGE 19
###• Z90657.00 [CVC_HOURS_WK_ADULT_ALL], CUMULATIVE HOURS R WORKED FROM AGE 20
###Create a project in R with these data. Use the dplyr command tbl_df to create a table data frame. 
###Then, show a glimpse of the data. The inclusion of the correct variables in this glimpse is the 
###focal point for assessment of successful completion of this item in the assignment.


# Item 1: print the data as a dplyr table data frame.
new_data %>% tbl_df()
## Source: local data frame [8,984 x 6]
## 
##    P_1997   sex  race   age hrswrkd14_19 hrswrkd20
##     (int) (int) (int) (int)        (int)     (int)
## 1       1     2     4    29         5831        NA
## 2       2     1     2    29           NA     29712
## 3       3     2     2    28         6489        NA
## 4       4     2     2    30         3292     23390
## 5       5     1     2    29          680     28056
## 6       6     2     2    29           NA     18379
## 7       7     1     2    28         1650        NA
## 8       8     2     4    30         2082     18419
## 9       9     1     4    29          864     19274
## 10     10     1     4    NA            0      6849
## ..    ...   ...   ...   ...          ...       ...
# Item 1: glimpse() lists every variable with its type and first values.
glimpse(new_data)
## Observations: 8,984
## Variables: 6
## $ P_1997       (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...
## $ sex          (int) 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2...
## $ race         (int) 4, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2...
## $ age          (int) 29, 29, 28, 30, 29, 29, 28, 30, 29, NA, 29, 29, 2...
## $ hrswrkd14_19 (int) 5831, NA, 6489, 3292, 680, NA, 1650, 2082, 864, 0...
## $ hrswrkd20    (int) NA, 29712, NA, 23390, 28056, 18379, NA, 18419, 19...
##2.
###Filter the data in your project to include only people who were 30 years of age at the time of 
###the 2011 interview. Calculate the number of people included. Perform the remainder of the 
###calculations in this assignment using data from these included people 30 years of age in 2011.

# Item 2: keep only the rows with age strictly between 29 and 31, i.e. the
# respondents who were 30 at the 2011 interview.
new_data30 <- new_data %>%
  filter(age > 29 & age < 31)

new_data30 %>% tbl_df()
## Source: local data frame [1,546 x 6]
## 
##    P_1997   sex  race   age hrswrkd14_19 hrswrkd20
##     (int) (int) (int) (int)        (int)     (int)
## 1       4     2     2    30         3292     23390
## 2       8     2     4    30         2082     18419
## 3      26     1     1    30          760         0
## 4      27     1     1    30         1592     11060
## 5      32     2     4    30         4611     16981
## 6      33     2     4    30         1862     26551
## 7      38     2     4    30            0        NA
## 8      55     2     2    30         1368     16934
## 9      59     2     1    30         5648        NA
## 10     68     1     1    30         1316        NA
## ..    ...   ...   ...   ...          ...       ...
# Count the rows kept by the age filter.
new_data30count <- new_data30 %>%
  summarise(count = n())

print(new_data30count)
##   count
## 1  1546
##3.
###Test the null hypothesis that there is no difference by sex between the
###mean cumulative hours worked from age 14 through age 19 with a probability
###of Type 1 error at the 0.05 level.

# library() fails immediately if a package is missing, whereas require() only
# warns and returns FALSE, which would let the script run on and break at the
# first pipeline or plot instead.
library(magrittr)
library(ggvis)
# Histogram of hours worked at ages 14-19 for rows with a value of 0 or more.
new_data30 %>%
  filter(hrswrkd14_19 >= 0) %>%
  ggvis(~hrswrkd14_19) %>%
  layer_histograms()
## Guessing width = 500 # range / 29

# Same variable, restricted to 500-8000 hours.
new_data30 %>%
  filter(hrswrkd14_19 >= 500 & hrswrkd14_19 <= 8000) %>%
  ggvis(~hrswrkd14_19) %>%
  layer_histograms()
## Guessing width = 200 # range / 38

# Histogram of hours worked from age 20 for rows with a value of 0 or more.
new_data30 %>%
  filter(hrswrkd20 >= 0) %>%
  ggvis(~hrswrkd20) %>%
  layer_histograms()
## Guessing width = 2000 # range / 32

# Same variable, restricted to 1000-30000 hours.
new_data30 %>%
  filter(hrswrkd20 >= 1000 & hrswrkd20 <= 30000) %>%
  ggvis(~hrswrkd20) %>%
  layer_histograms()
## Guessing width = 1000 # range / 29

# Data used by the t-tests: rows whose values fall inside BOTH ranges above.
new_data30fltr <- new_data30 %>%
  filter(
    hrswrkd14_19 >= 500 & hrswrkd14_19 <= 8000,
    hrswrkd20 >= 1000 & hrswrkd20 <= 30000
  )

new_data30fltr %>% tbl_df()
## Source: local data frame [888 x 6]
## 
##    P_1997   sex  race   age hrswrkd14_19 hrswrkd20
##     (int) (int) (int) (int)        (int)     (int)
## 1       4     2     2    30         3292     23390
## 2       8     2     4    30         2082     18419
## 3      27     1     1    30         1592     11060
## 4      32     2     4    30         4611     16981
## 5      33     2     4    30         1862     26551
## 6      55     2     2    30         1368     16934
## 7      80     1     1    30         3120      1891
## 8      83     1     2    30         1910     14857
## 9     102     2     2    30         3488     17674
## 10    104     2     4    30         4468     28206
## ..    ...   ...   ...   ...          ...       ...
# Distribution of sex in the filtered data (new_data30fltr).
new_data30fltr %>%
  ggvis(~sex) %>%
  layer_histograms()
## Guessing width = 0.05 # range / 20

# Distribution of hours worked at ages 14-19 in the filtered data.
new_data30fltr %>%
  ggvis(~hrswrkd14_19) %>%
  layer_histograms()
## Guessing width = 200 # range / 38

# Item 3: two-sample t-test (equal variances assumed) of teen hours by sex.
t.test(hrswrkd14_19 ~ sex, data = new_data30fltr, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  hrswrkd14_19 by sex
## t = 0.083046, df = 886, p-value = 0.9338
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -221.9063  241.5152
## sample estimates:
## mean in group 1 mean in group 2 
##        3244.928        3235.123
#Here I fail to reject the null hypothesis that there is
#no difference by sex between the mean cumulative hours worked from
#age 14 through age 19, because the p-value is greater than the .05 level.
#t(hrswrkd14_19 by sex)=0.083046, p=0.9338, df=886, 95% CI [-221.9063, 241.5152]


##4.
###Test the null hypothesis that there is no difference by sex between
###the mean cumulative hours worked from age 20 and older with a probability of
###Type 1 error at the 0.05 level.

# Distribution of hours worked from age 20 in the filtered data.
new_data30fltr %>%
  ggvis(~hrswrkd20) %>%
  layer_histograms()
## Guessing width = 1000 # range / 29

# Item 4: two-sample t-test (equal variances assumed) of adult hours by sex.
t.test(hrswrkd20 ~ sex, data = new_data30fltr, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  hrswrkd20 by sex
## t = 2.2493, df = 886, p-value = 0.02474
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   138.0059 2027.6400
## sample estimates:
## mean in group 1 mean in group 2 
##        19414.71        18331.89
#Here I reject the null hypothesis that there is
#no difference by sex between the mean cumulative hours worked from
#age 20 and older, because the p-value is less than the .05 level.
#t(hrswrkd20 by sex)=2.2493, p=0.02474, df=886, 95% CI [138.0059, 2027.6400]


##5.
###Test the null hypothesis that there is no difference by race/ethnicity between the mean cumulative hours
###worked from age 14 through age 19 with a probability of Type 1 error at the 0.05 level.
###In this analysis — and in the analysis for item (6) — code race/ethnicity
###as "1" if race/ethnicity is "Non-Black, Non-Hispanic" and "0" otherwise.

# Item 5: collapse race/ethnicity to an indicator that is 1 for code 4
# ("Non-Black / Non-Hispanic") and 0 for every other code.
# NOTE: race is overwritten in place, so running this line a second time would
# set every row to 0 (no value equals 4 once the column holds only 0/1).
new_data30fltr$race <- as.numeric(new_data30fltr$race == 4)

# Two-sample t-test (equal variances assumed) of teen hours by the indicator.
t.test(hrswrkd14_19 ~ race, data = new_data30fltr, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  hrswrkd14_19 by race
## t = -4.5734, df = 886, p-value = 5.481e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -760.1347 -303.6296
## sample estimates:
## mean in group 0 mean in group 1 
##        2961.043        3492.925
#Here I reject the null hypothesis that there is
#no difference by race between the mean cumulative hours worked from
#age 14 through age 19, because the p-value is less than the .05 level.
#t(hrswrkd14_19 by race)=-4.5734, p=5.481e-06, df=886, 95% CI [-760.1347, -303.6296]


##6.
###Test the null hypothesis that there is no difference by race/ethnicity
###between the mean cumulative hours worked from age 20 and older with a
###probability of Type 1 error at the 0.05 level.

# Item 6: two-sample t-test (equal variances assumed) of adult hours by the
# 0/1 race indicator.
t.test(hrswrkd20 ~ race, data = new_data30fltr, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  hrswrkd20 by race
## t = -0.70509, df = 886, p-value = 0.4809
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1283.2099   604.8983
## sample estimates:
## mean in group 0 mean in group 1 
##        18644.49        18983.65
#Here I fail to reject the null hypothesis that there is
#no difference by race between the mean cumulative hours worked from
#age 20 and older, because the p-value is greater than the .05 level.
#t(hrswrkd20 by race)=-0.70509, p=0.4809, df=886, 95% CI [-1283.2099, 604.8983]



#####References:

####National Longitudinal Survey of Youth 1997 (NLSY1997), Retrieved
####October 8, 2015, from:https://www.nlsinfo.org/investigator/pages/login.jsp