#getwd()
# Read the space-separated extract and name its columns by reference number.
new_data <- read.table("class11.dat", sep = " ")
names(new_data) <- c(
  "R0000100", "R0536300", "R0536401", "R0536402",
  "R1235800", "R1482600", "R9700300"
)
# Handle missing values: each of the five sentinel codes is replaced by NA.
#   -1 Refused
#   -2 Don't know
#   -3 Invalid missing
#   -4 Valid missing
#   -5 Non-interview
for (missing_code in c(-1, -2, -3, -4, -5)) {
  new_data[new_data == missing_code] <- NA
}
# Apply value labels to the coded survey variables.
#
# Args:
#   data: data frame whose columns still carry the reference-number names
#         (R0000100, R0536300, R0536401, R1235800, R1482600, R9700300).
# Returns:
#   `data` with those columns converted to factors. Values that are not
#   covered by one of the categories below are represented as NA. Columns not
#   listed here (e.g. R0536402) are returned unchanged.
vallabels <- function(data) {
  # Intervals are closed on the left (right = FALSE). include.lowest = TRUE
  # additionally closes the last interval on the right, so an ID of exactly
  # 9999 falls in "9000 TO 9999" instead of becoming NA.
  data$R0000100 <- cut(
    data$R0000100,
    breaks = c(0, 1, seq(1000, 9000, by = 1000), 9999),
    labels = c(
      "0", "1 TO 999", "1000 TO 1999", "2000 TO 2999", "3000 TO 3999",
      "4000 TO 4999", "5000 TO 5999", "6000 TO 6999", "7000 TO 7999",
      "8000 TO 8999", "9000 TO 9999"
    ),
    right = FALSE,
    include.lowest = TRUE
  )

  data$R0536300 <- factor(
    data$R0536300,
    levels = c(1, 2, 0),
    labels = c("Male", "Female", "No Information")
  )

  # Produces "1: January", "2: February", ..., "12: December".
  data$R0536401 <- factor(
    data$R0536401,
    levels = 1:12,
    labels = paste0(1:12, ": ", month.name)
  )

  data$R1235800 <- factor(
    data$R1235800,
    levels = c(1, 0),
    labels = c("Cross-sectional", "Oversample")
  )

  data$R1482600 <- factor(
    data$R1482600,
    levels = 1:4,
    labels = c(
      "Black", "Hispanic", "Mixed Race (Non-Hispanic)",
      "Non-Black / Non-Hispanic"
    )
  )

  # NOTE(review): codes 4 and 5 carry the same label, so they collapse into a
  # single factor level (older R versions instead warn "duplicated levels in
  # factors are deprecated"). Confirm against the codebook whether these two
  # codes are meant to be distinct categories.
  data$R9700300 <- factor(
    data$R9700300,
    levels = 1:20,
    labels = c(
      "Currently in High School",
      "High School Senior",
      "Less than a High School Diploma",
      "Credential Near Completion",
      "Credential Near Completion",
      "Occupational program certificate",
      "Correspondence school diploma",
      "Home Study Diploma",
      "Adult education Diploma",
      "High school certificate of attendance",
      "Completed one semester of college",
      "High School Diploma",
      "Associate degree",
      "Professional Nursing Degree",
      "Bachelor's degree",
      "First professional degree",
      "Master's degree",
      "Post Master's degree",
      "Doctorate Degree",
      "None of the Above"
    )
  )

  data
}
# Descriptive labels for the seven extracted variables. They appear to follow
# the column order of the extract; note that the birth month and birth year
# entries share the same label text.
varlabels <- c(
  "PUBID - YTH ID CODE 1997",
  "KEY!SEX (SYMBOL) 1997",
  "KEY!BDATE M/Y (SYMBOL) 1997",
  "KEY!BDATE M/Y (SYMBOL) 1997",
  "CV_SAMPLE_TYPE 1997",
  "KEY!RACE_ETHNICITY (SYMBOL) 1997",
  "ASVAB_HIGH_DEGREE_EVER 1999"
)
# Replace the reference-number column names with question names (qnames).
#
# Args:
#   data: data frame with the seven extract columns in their original order.
# Returns:
#   The same data frame with its columns renamed by position.
qnames <- function(data) {
  question_names <- c(
    "PUBID_1997",
    "KEY_SEX_1997",
    "KEY_BDATE_M_1997",
    "KEY_BDATE_Y_1997",
    "CV_SAMPLE_TYPE_1997",
    "KEY_RACE_ETHNICITY_1997",
    "ASVAB_HIGH_DEGREE_EVER_1999"
  )
  names(data) <- question_names
  data
}
#********************************************************************************************************
# Create a data frame called "categories" in which the coded columns carry value labels.
# This must run while new_data still has its reference-number column names,
# because vallabels() looks the columns up by those names.
categories <- vallabels(new_data)
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
# Rename the variables in both data frames using Qnames instead of Reference Numbers.
new_data <- qnames(new_data)
categories <- qnames(categories)
# Produce per-column summaries for the raw (uncategorized) data frame;
# the columns are still numeric here, so these are numeric summaries.
summary(new_data)
## PUBID_1997 KEY_SEX_1997 KEY_BDATE_M_1997 KEY_BDATE_Y_1997
## Min. : 1 Min. :1.000 Min. : 1.000 Min. :1980
## 1st Qu.:2249 1st Qu.:1.000 1st Qu.: 3.000 1st Qu.:1981
## Median :4502 Median :1.000 Median : 7.000 Median :1982
## Mean :4504 Mean :1.488 Mean : 6.556 Mean :1982
## 3rd Qu.:6758 3rd Qu.:2.000 3rd Qu.:10.000 3rd Qu.:1983
## Max. :9022 Max. :2.000 Max. :12.000 Max. :1984
##
## CV_SAMPLE_TYPE_1997 KEY_RACE_ETHNICITY_1997 ASVAB_HIGH_DEGREE_EVER_1999
## Min. :0.0000 Min. :1.000 Min. : 1.000
## 1st Qu.:1.0000 1st Qu.:1.000 1st Qu.: 1.000
## Median :1.0000 Median :4.000 Median : 1.000
## Mean :0.7511 Mean :2.788 Mean : 7.935
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:20.000
## Max. :1.0000 Max. :4.000 Max. :20.000
## NA's :2073
# Produce summaries for the "categories" data frame.
# NOTE: the line below is intentionally left commented out. By this point
# new_data has already been renamed by qnames(), and vallabels() looks the
# columns up by reference number, so re-running it here would not work.
#categories <- vallabels(new_data)
summary(categories)
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## PUBID_1997 KEY_SEX_1997 KEY_BDATE_M_1997
## 1000 TO 1999: 999 Male :4599 9: September: 839
## 1 TO 999 : 998 Female :4385 1: January : 816
## 4000 TO 4999: 998 No Information: 0 8: August : 782
## 2000 TO 2999: 997 10: October : 765
## 3000 TO 3999: 996 11: November: 763
## 5000 TO 5999: 996 7: July : 762
## (Other) :3000 (Other) :4257
## KEY_BDATE_Y_1997 CV_SAMPLE_TYPE_1997
## Min. :1980 Cross-sectional:6748
## 1st Qu.:1981 Oversample :2236
## Median :1982
## Mean :1982
## 3rd Qu.:1983
## Max. :1984
##
## KEY_RACE_ETHNICITY_1997
## Black :2335
## Hispanic :1901
## Mixed Race (Non-Hispanic): 83
## Non-Black / Non-Hispanic :4665
##
##
##
## ASVAB_HIGH_DEGREE_EVER_1999
## Currently in High School :3523
## None of the Above :2281
## High School Senior : 381
## Less than a High School Diploma: 338
## High School Diploma : 92
## (Other) : 296
## NA's :2073
#********************************************************************************************************
# NOTE(review): require() only returns FALSE (with a warning) when the package
# cannot be loaded, so a missing dplyr would surface later rather than here;
# library(dplyr) would fail fast instead. None of the statements visible in
# this part of the script call a dplyr function - confirm whether it is needed.
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Show the first rows of the renamed raw data as a quick sanity check.
head(new_data)
## PUBID_1997 KEY_SEX_1997 KEY_BDATE_M_1997 KEY_BDATE_Y_1997
## 1 1 2 9 1981
## 2 2 1 7 1982
## 3 3 2 9 1983
## 4 4 2 2 1981
## 5 5 1 10 1982
## 6 6 2 1 1982
## CV_SAMPLE_TYPE_1997 KEY_RACE_ETHNICITY_1997 ASVAB_HIGH_DEGREE_EVER_1999
## 1 1 4 1
## 2 1 2 1
## 3 1 2 20
## 4 1 2 1
## 5 1 2 NA
## 6 1 2 1
# Three-way contingency table of counts: sex x sample type x race/ethnicity.
sex_cv_race <- xtabs(
  ~ new_data$KEY_SEX_1997 +
    new_data$CV_SAMPLE_TYPE_1997 +
    new_data$KEY_RACE_ETHNICITY_1997
)
sex_cv_race
## , , new_data$KEY_RACE_ETHNICITY_1997 = 1
##
## new_data$CV_SAMPLE_TYPE_1997
## new_data$KEY_SEX_1997 0 1
## 1 632 537
## 2 622 544
##
## , , new_data$KEY_RACE_ETHNICITY_1997 = 2
##
## new_data$CV_SAMPLE_TYPE_1997
## new_data$KEY_SEX_1997 0 1
## 1 508 469
## 2 472 452
##
## , , new_data$KEY_RACE_ETHNICITY_1997 = 3
##
## new_data$CV_SAMPLE_TYPE_1997
## new_data$KEY_SEX_1997 0 1
## 1 0 40
## 2 2 41
##
## , , new_data$KEY_RACE_ETHNICITY_1997 = 4
##
## new_data$CV_SAMPLE_TYPE_1997
## new_data$KEY_SEX_1997 0 1
## 1 0 2413
## 2 0 2252
# Display the three-way table in flat (two-dimensional) form.
ftable(sex_cv_race)
## new_data$KEY_RACE_ETHNICITY_1997 1 2 3 4
## new_data$KEY_SEX_1997 new_data$CV_SAMPLE_TYPE_1997
## 1 0 632 508 0 0
## 1 537 469 40 2413
## 2 0 622 472 2 0
## 1 544 452 41 2252
# Summarise the table: number of cases, number of factors, and a chi-squared
# test for independence of all factors.
summary(sex_cv_race)
## Call: xtabs(formula = ~new_data$KEY_SEX_1997 + new_data$CV_SAMPLE_TYPE_1997 +
## new_data$KEY_RACE_ETHNICITY_1997)
## Number of cases in table: 8984
## Number of factors: 3
## Test for independence of all factors:
## Chisq = 3331, df = 10, p-value = 0
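## The reported p-value is 0 (i.e., too small to be displayed), which is less than any conventional alpha (e.g., 0.05). Therefore, we reject the null hypothesis that sex, sample type, and race/ethnicity are mutually independent.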