# Working directory
# The data file below is given as a relative path, so it is resolved against
# the current working directory. Fill in and uncomment setwd() if the file
# lives somewhere else.
# setwd()
getwd()
## [1] "/Users/aleigey/Desktop/WFED540fall15/fourvariables"

# Read the space-delimited extract and name its four columns by reference
# number in a single step.
new_data <- setNames(
  read.table("fourvariables.dat", sep = " "),
  c("R0000100", "R9859700", "R9859900", "T7551500")
)

# Handle missing values
# Every column is scanned once and the negative non-response codes are
# replaced with NA:
#   -1 Refused, -2 Dont know, -3 Invalid missing,
#   -4 Valid missing, -5 Non-interview
# `new_data[] <-` writes the columns back in place, so new_data stays a
# data frame with the same names and row names.
new_data[] <- lapply(new_data, function(column) {
  column[column %in% c(-1, -2, -3, -4, -5)] <- NA
  column
})

# Attach value labels to the four extract variables.
#
# Args:
#   data: data frame (or list) with the reference-number columns R0000100,
#         R9859700, R9859900 and T7551500. Call this BEFORE qnames(), which
#         renames those columns.
#
# Returns:
#   `data` with the four columns replaced by factors:
#     - R0000100 and T7551500 are binned into labelled ranges;
#     - R9859700 and R9859900 are mapped code-by-code to descriptive labels.
#   If there are values not categorized they will be represented as NA.
vallabels <- function(data) {
  required <- c("R0000100", "R9859700", "R9859900", "T7551500")
  absent <- setdiff(required, names(data))
  if (length(absent) > 0) {
    stop(
      "vallabels() needs reference-number column(s): ",
      paste(absent, collapse = ", "),
      call. = FALSE
    )
  }

  # The two transcript-concentration variables share one coding scheme.
  conc_levels <- c(-8, -7, -6, 0, 1)
  conc_labels <- c(
    "Invalid Carnegie credits or only pre-hs courses",
    "All required courses taken but at least one course has missing credit info",
    "Criterion for one subject not met",
    "Does not meet all of the criteria",
    "Meets all of the criteria"
  )

  id_breaks <- c(
    0, 1, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 9999
  )
  id_labels <- c(
    "0", "1 TO 999", "1000 TO 1999", "2000 TO 2999", "3000 TO 3999",
    "4000 TO 4999", "5000 TO 5999", "6000 TO 6999", "7000 TO 7999",
    "8000 TO 8999", "9000 TO 9999"
  )

  income_breaks <- c(
    0, 1, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000,
    15000, 20000, 25000, 50000, 99999999
  )
  income_labels <- c(
    "0", "1 TO 999", "1000 TO 1999", "2000 TO 2999", "3000 TO 3999",
    "4000 TO 4999", "5000 TO 5999", "6000 TO 6999", "7000 TO 7999",
    "8000 TO 8999", "9000 TO 9999", "10000 TO 14999", "15000 TO 19999",
    "20000 TO 24999", "25000 TO 49999", "50000 TO 99999999: 50000+"
  )

  # Bins are left-closed ([lo, hi)). include.lowest = TRUE additionally closes
  # the LAST bin on the right, so the top break itself (9999 / 99999999) falls
  # into the final category as its label promises instead of becoming NA.
  data$R0000100 <- cut(
    data$R0000100,
    breaks = id_breaks,
    labels = id_labels,
    right = FALSE,
    include.lowest = TRUE
  )
  data$R9859700 <- factor(
    data$R9859700,
    levels = conc_levels,
    labels = conc_labels
  )
  data$R9859900 <- factor(
    data$R9859900,
    levels = conc_levels,
    labels = conc_labels
  )
  data$T7551500 <- cut(
    data$T7551500,
    breaks = income_breaks,
    labels = income_labels,
    right = FALSE,
    include.lowest = TRUE
  )

  data
}

# Descriptive labels for the four variables, listed in the same order as the
# columns of the extract.
varlabels <- c(
  "PUBID - YTH ID CODE 1997",
  "TRANS_ACAD_CONC HSTR",
  "TRANS_VOC_CONC HSTR",
  "TTL INC WAGES, SALARY PDLI 2011"
)

# Rename the columns from reference numbers to question names.
#
# Args:
#   data: data frame whose columns are, in order, the four extract variables.
#         The renaming is purely positional; existing names are not checked.
#
# Returns:
#   A copy of `data` with its names replaced by the question names.
qnames <- function(data) {
  setNames(
    data,
    c(
      "PUBID_1997",
      "TRANS_ACAD_CONC_HSTR",
      "TRANS_VOC_CONC_HSTR",
      "YINC_1700A_2011"
    )
  )
}

#********************************************************************************************************

# Build "categories": a copy of the data with value labels applied and the
# columns renamed to question names. Labelling comes first because
# vallabels() addresses the columns by their reference numbers.
categories <- qnames(vallabels(new_data))

# Rename the raw data to question names as well.
new_data <- qnames(new_data)

# Produce summaries for the raw (uncategorized) data file
summary(new_data)
##    PUBID_1997   TRANS_ACAD_CONC_HSTR TRANS_VOC_CONC_HSTR YINC_1700A_2011 
##  Min.   :   1   Min.   :-8.0000      Min.   :-8.0000     Min.   :     0  
##  1st Qu.:2249   1st Qu.: 0.0000      1st Qu.: 0.0000     1st Qu.: 20000  
##  Median :4502   Median : 0.0000      Median : 0.0000     Median : 30000  
##  Mean   :4504   Mean   :-0.1847      Mean   : 0.1171     Mean   : 40970  
##  3rd Qu.:6758   3rd Qu.: 1.0000      3rd Qu.: 1.0000     3rd Qu.: 45250  
##  Max.   :9022   Max.   : 1.0000      Max.   : 1.0000     Max.   :320000  
##                 NA's   :2752         NA's   :2752        NA's   :8824
# Optional: remove the '#' before the following line to also summarise the
# labelled "categories" data file. It already exists at this point; do not
# rebuild it with vallabels(new_data) here, because new_data no longer carries
# the reference-number column names that vallabels() looks up.
#summary(categories)

#************************************************************************************************************