# Working directory: uncomment setwd() and point it at the data folder if
# the extract is not in the current directory.
# setwd()
getwd()
## [1] "/Users/aleigey/Desktop/WFED540fall15/fourvariables"
# Read the space-delimited extract and name the columns by reference number.
new_data <- read.table("fourvariables.dat", sep = " ")
names(new_data) <- c("R0000100", "R9859700", "R9859900", "T7551500")
# Recode the reserved negative codes as NA in every column.
new_data[
  new_data == -1 |   # Refused
    new_data == -2 | # Don't know
    new_data == -3 | # Invalid missing
    new_data == -4 | # Valid missing
    new_data == -5   # Non-interview
] <- NA
# Replace the raw codes in the four reference-number columns with labelled
# factors.
#
# data: a data frame with numeric columns R0000100, R9859700, R9859900 and
#       T7551500.
# Returns: a copy of `data` in which those four columns are factors.
#
# Values that fall outside every listed category (including NA and any
# negative value in the two banded columns) are represented as NA.
vallabels <- function(data) {
  # Breaks and labels shared by the two banded columns: "0", then bands of
  # one thousand up to 9999.
  thousand_breaks <- c(0, 1, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000,
                       9000)
  thousand_labels <- c("0", "1 TO 999", "1000 TO 1999", "2000 TO 2999",
                       "3000 TO 3999", "4000 TO 4999", "5000 TO 5999",
                       "6000 TO 6999", "7000 TO 7999", "8000 TO 8999",
                       "9000 TO 9999")

  # Codes and labels shared by the two transcript-concentration columns.
  carnegie_codes <- c(-8, -7, -6, 0, 1)
  carnegie_labels <- c(
    "Invalid Carnegie credits or only pre-hs courses",
    "All required courses taken but at least one course has missing credit info",
    "Criterion for one subject not met",
    "Does not meet all of the criteria",
    "Meets all of the criteria"
  )

  # Bands are left-closed ([a, b)). include.lowest = TRUE closes the final
  # band on the right as well, so a value equal to the top break (9999 here,
  # 99999999 below) lands in the last category instead of becoming NA.
  data$R0000100 <- cut(
    data$R0000100,
    breaks = c(thousand_breaks, 9999),
    labels = thousand_labels,
    right = FALSE,
    include.lowest = TRUE
  )
  data$R9859700 <- factor(data$R9859700, levels = carnegie_codes,
                          labels = carnegie_labels)
  data$R9859900 <- factor(data$R9859900, levels = carnegie_codes,
                          labels = carnegie_labels)
  data$T7551500 <- cut(
    data$T7551500,
    breaks = c(thousand_breaks, 10000, 15000, 20000, 25000, 50000, 99999999),
    labels = c(thousand_labels, "10000 TO 14999", "15000 TO 19999",
               "20000 TO 24999", "25000 TO 49999",
               "50000 TO 99999999: 50000+"),
    right = FALSE,
    include.lowest = TRUE
  )
  data
}
# Descriptive variable titles, listed in the same order as the data columns.
varlabels <- c(
  "PUBID - YTH ID CODE 1997",
  "TRANS_ACAD_CONC HSTR",
  "TRANS_VOC_CONC HSTR",
  "TTL INC WAGES, SALARY PDLI 2011"
)
# Rename the columns of `data` positionally, replacing the reference numbers
# (rnums) with question names (qnames). Returns the renamed data frame.
qnames <- function(data) {
  question_names <- c(
    "PUBID_1997",
    "TRANS_ACAD_CONC_HSTR",
    "TRANS_VOC_CONC_HSTR",
    "YINC_1700A_2011"
  )
  setNames(data, question_names)
}
#********************************************************************************************************
# Build "categories": the data with value labels applied, then renamed to
# use Qnames. Value labels must be applied first, while the columns still
# carry their Reference Numbers.
categories <- qnames(vallabels(new_data))
# Rename the raw data to use Qnames instead of Reference Numbers as well.
new_data <- qnames(new_data)
# Summarise the raw (uncategorized) data.
summary(new_data)
## PUBID_1997 TRANS_ACAD_CONC_HSTR TRANS_VOC_CONC_HSTR YINC_1700A_2011
## Min. : 1 Min. :-8.0000 Min. :-8.0000 Min. : 0
## 1st Qu.:2249 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 20000
## Median :4502 Median : 0.0000 Median : 0.0000 Median : 30000
## Mean :4504 Mean :-0.1847 Mean : 0.1171 Mean : 40970
## 3rd Qu.:6758 3rd Qu.: 1.0000 3rd Qu.: 1.0000 3rd Qu.: 45250
## Max. :9022 Max. : 1.0000 Max. : 1.0000 Max. :320000
## NA's :2752 NA's :2752 NA's :8824
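# Remove the '#' before the summary() line below to produce summaries for the "categories" data file.
# Note: "categories" has already been created above. Leave the vallabels() line commented out, because
# new_data has been renamed to Qnames by this point and vallabels() expects the Reference Number columns.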
#categories <- vallabels(new_data)
#summary(categories)
#************************************************************************************************************