#getwd()

# Read the space-separated extract and attach the reference numbers as
# column names.
new_data <- read.table("class11.dat", sep = " ")
names(new_data) <- c(
  "R0000100",
  "R0536300",
  "R0536401",
  "R0536402",
  "R1235800",
  "R1482600",
  "R9700300"
)

# Handle missing values: each of the five negative codes becomes NA.
missing_codes <- c(
  -1, # Refused
  -2, # Dont know
  -3, # Invalid missing
  -4, # Valid missing
  -5  # Non-interview
)
for (missing_code in missing_codes) {
  new_data[new_data == missing_code] <- NA
}

# Convert the coded columns of the raw data into labelled factors.
#
# Args:
#   data: data frame whose columns still carry the reference-number names
#     (R0000100, R0536300, R0536401, R1235800, R1482600, R9700300).
#
# Returns:
#   `data` with those six columns replaced by factors. Columns not listed
#   above are left untouched. If there are values not categorized they will
#   be represented as NA.
vallabels <- function(data) {
  # Bins are closed on the left (right = FALSE). include.lowest = TRUE also
  # closes the last bin on the right, so 9999 falls into "9000 TO 9999"
  # instead of becoming NA.
  data$R0000100 <- cut(
    data$R0000100,
    breaks = c(0, 1, seq(1000, 9000, by = 1000), 9999),
    labels = c(
      "0", "1 TO 999", "1000 TO 1999", "2000 TO 2999", "3000 TO 3999",
      "4000 TO 4999", "5000 TO 5999", "6000 TO 6999", "7000 TO 7999",
      "8000 TO 8999", "9000 TO 9999"
    ),
    right = FALSE,
    include.lowest = TRUE
  )

  data$R0536300 <- factor(
    data$R0536300,
    levels = c(1, 2, 0),
    labels = c("Male", "Female", "No Information")
  )

  data$R0536401 <- factor(
    data$R0536401,
    levels = 1:12,
    labels = c(
      "1: January", "2: February", "3: March", "4: April", "5: May",
      "6: June", "7: July", "8: August", "9: September", "10: October",
      "11: November", "12: December"
    )
  )

  data$R1235800 <- factor(
    data$R1235800,
    levels = c(1, 0),
    labels = c("Cross-sectional", "Oversample")
  )

  data$R1482600 <- factor(
    data$R1482600,
    levels = 1:4,
    labels = c(
      "Black", "Hispanic", "Mixed Race (Non-Hispanic)",
      "Non-Black / Non-Hispanic"
    )
  )

  # Codes 4 and 5 share the label "Credential Near Completion". Passing
  # duplicated labels straight to factor() raises "duplicated levels in
  # factors are deprecated" on older R versions, so the factor is built on the
  # raw codes first and relabelled through `levels<-`, which merges codes that
  # share a label into a single level.
  degree_labels <- c(
    "Currently in High School",
    "High School Senior",
    "Less than a High School Diploma",
    "Credential Near Completion",
    "Credential Near Completion",
    "Occupational program certificate",
    "Correspondence school diploma",
    "Home Study Diploma",
    "Adult education Diploma",
    "High school certificate of attendance",
    "Completed one semester of college",
    "High School Diploma",
    "Associate degree",
    "Professional Nursing Degree",
    "Bachelor's degree",
    "First professional degree",
    "Master's degree",
    "Post Master's degree",
    "Doctorate Degree",
    "None of the Above"
  )
  degree <- factor(data$R9700300, levels = 1:20)
  levels(degree) <- degree_labels
  data$R9700300 <- degree

  data
}

# Descriptive variable labels, one per column of the data file and in the
# same order as the column names.
varlabels <- c(
  "PUBID - YTH ID CODE 1997",
  "KEY!SEX (SYMBOL) 1997",
  "KEY!BDATE M/Y (SYMBOL) 1997",
  "KEY!BDATE M/Y (SYMBOL) 1997",
  "CV_SAMPLE_TYPE 1997",
  "KEY!RACE_ETHNICITY (SYMBOL) 1997",
  "ASVAB_HIGH_DEGREE_EVER 1999"
)

# Swap the reference-number column names for question names.
#
# Args:
#   data: data frame with the seven columns in their original order.
#
# Returns:
#   `data` with its column names replaced by the question names below.
qnames <- function(data) {
  question_names <- c(
    "PUBID_1997",
    "KEY_SEX_1997",
    "KEY_BDATE_M_1997",
    "KEY_BDATE_Y_1997",
    "CV_SAMPLE_TYPE_1997",
    "KEY_RACE_ETHNICITY_1997",
    "ASVAB_HIGH_DEGREE_EVER_1999"
  )
  names(data) <- question_names
  data
}


#********************************************************************************************************

# Create a data file called "categories" with value labels applied.
# This must run before the renaming below: vallabels() looks the columns up
# by their reference numbers.
categories <- vallabels(new_data)
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
# Rename the variables in both data files using Qnames instead of Reference Numbers.
new_data <- qnames(new_data)
categories <- qnames(categories)

# Produce summaries for the raw (uncategorized) data file
summary(new_data)
##    PUBID_1997    KEY_SEX_1997   KEY_BDATE_M_1997 KEY_BDATE_Y_1997
##  Min.   :   1   Min.   :1.000   Min.   : 1.000   Min.   :1980    
##  1st Qu.:2249   1st Qu.:1.000   1st Qu.: 3.000   1st Qu.:1981    
##  Median :4502   Median :1.000   Median : 7.000   Median :1982    
##  Mean   :4504   Mean   :1.488   Mean   : 6.556   Mean   :1982    
##  3rd Qu.:6758   3rd Qu.:2.000   3rd Qu.:10.000   3rd Qu.:1983    
##  Max.   :9022   Max.   :2.000   Max.   :12.000   Max.   :1984    
##                                                                  
##  CV_SAMPLE_TYPE_1997 KEY_RACE_ETHNICITY_1997 ASVAB_HIGH_DEGREE_EVER_1999
##  Min.   :0.0000      Min.   :1.000           Min.   : 1.000             
##  1st Qu.:1.0000      1st Qu.:1.000           1st Qu.: 1.000             
##  Median :1.0000      Median :4.000           Median : 1.000             
##  Mean   :0.7511      Mean   :2.788           Mean   : 7.935             
##  3rd Qu.:1.0000      3rd Qu.:4.000           3rd Qu.:20.000             
##  Max.   :1.0000      Max.   :4.000           Max.   :20.000             
##                                              NA's   :2073
# Produce summaries for the value-labelled "categories" data file.

# Left disabled: new_data has already been renamed by qnames() above, so
# vallabels() would no longer find the reference-number columns it expects.
#categories <- vallabels(new_data)
summary(categories)
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
##         PUBID_1997           KEY_SEX_1997      KEY_BDATE_M_1997
##  1000 TO 1999: 999   Male          :4599   9: September: 839   
##  1 TO 999    : 998   Female        :4385   1: January  : 816   
##  4000 TO 4999: 998   No Information:   0   8: August   : 782   
##  2000 TO 2999: 997                         10: October : 765   
##  3000 TO 3999: 996                         11: November: 763   
##  5000 TO 5999: 996                         7: July     : 762   
##  (Other)     :3000                         (Other)     :4257   
##  KEY_BDATE_Y_1997      CV_SAMPLE_TYPE_1997
##  Min.   :1980     Cross-sectional:6748    
##  1st Qu.:1981     Oversample     :2236    
##  Median :1982                             
##  Mean   :1982                             
##  3rd Qu.:1983                             
##  Max.   :1984                             
##                                           
##               KEY_RACE_ETHNICITY_1997
##  Black                    :2335      
##  Hispanic                 :1901      
##  Mixed Race (Non-Hispanic):  83      
##  Non-Black / Non-Hispanic :4665      
##                                      
##                                      
##                                      
##                   ASVAB_HIGH_DEGREE_EVER_1999
##  Currently in High School       :3523        
##  None of the Above              :2281        
##  High School Senior             : 381        
##  Less than a High School Diploma: 338        
##  High School Diploma            :  92        
##  (Other)                        : 296        
##  NA's                           :2073
#********************************************************************************************************


require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Preview the first rows of the renamed raw data.
head(new_data)
##   PUBID_1997 KEY_SEX_1997 KEY_BDATE_M_1997 KEY_BDATE_Y_1997
## 1          1            2                9             1981
## 2          2            1                7             1982
## 3          3            2                9             1983
## 4          4            2                2             1981
## 5          5            1               10             1982
## 6          6            2                1             1982
##   CV_SAMPLE_TYPE_1997 KEY_RACE_ETHNICITY_1997 ASVAB_HIGH_DEGREE_EVER_1999
## 1                   1                       4                           1
## 2                   1                       2                           1
## 3                   1                       2                          20
## 4                   1                       2                           1
## 5                   1                       2                          NA
## 6                   1                       2                           1
# Three-way contingency table: sex by sample type by race/ethnicity.
sex_cv_race <- xtabs(
  ~ new_data$KEY_SEX_1997 +
    new_data$CV_SAMPLE_TYPE_1997 +
    new_data$KEY_RACE_ETHNICITY_1997
)
sex_cv_race
## , , new_data$KEY_RACE_ETHNICITY_1997 = 1
## 
##                      new_data$CV_SAMPLE_TYPE_1997
## new_data$KEY_SEX_1997    0    1
##                     1  632  537
##                     2  622  544
## 
## , , new_data$KEY_RACE_ETHNICITY_1997 = 2
## 
##                      new_data$CV_SAMPLE_TYPE_1997
## new_data$KEY_SEX_1997    0    1
##                     1  508  469
##                     2  472  452
## 
## , , new_data$KEY_RACE_ETHNICITY_1997 = 3
## 
##                      new_data$CV_SAMPLE_TYPE_1997
## new_data$KEY_SEX_1997    0    1
##                     1    0   40
##                     2    2   41
## 
## , , new_data$KEY_RACE_ETHNICITY_1997 = 4
## 
##                      new_data$CV_SAMPLE_TYPE_1997
## new_data$KEY_SEX_1997    0    1
##                     1    0 2413
##                     2    0 2252
# Show the three-way table as one flat table: sex and sample type in rows,
# race/ethnicity in columns.
ftable(sex_cv_race)
##                                                    new_data$KEY_RACE_ETHNICITY_1997    1    2    3    4
## new_data$KEY_SEX_1997 new_data$CV_SAMPLE_TYPE_1997                                                     
## 1                     0                                                              632  508    0    0
##                       1                                                              537  469   40 2413
## 2                     0                                                              622  472    2    0
##                       1                                                              544  452   41 2252
# Chi-squared test for independence of all three factors in the table.
summary(sex_cv_race)
## Call: xtabs(formula = ~new_data$KEY_SEX_1997 + new_data$CV_SAMPLE_TYPE_1997 + 
##     new_data$KEY_RACE_ETHNICITY_1997)
## Number of cases in table: 8984 
## Number of factors: 3 
## Test for independence of all factors:
##  Chisq = 3331, df = 10, p-value = 0
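## The p-value is reported as 0 (i.e. too small to display), which is less than
## any conventional alpha such as 0.05. Therefore, we reject the null hypothesis
## that sex, sample type, and race/ethnicity are mutually independent.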