#WFED540 Class 12 Practice
##By: Andrew Leigey
##Date: November 6, 2015
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(ggvis)
## Loading required package: ggvis
require(magrittr)
## Loading required package: magrittr
# Read the space-delimited extract and name its four columns by reference number.
new_data <- read.table("Class12_3_Variables.dat", sep = " ")
names(new_data) <- c("R0000100", "R0536300", "R9859700", "R9859900")
# Handle missing values: every non-response code becomes NA, column by column.
#   -1 Refused
#   -2 Dont know
#   -3 Invalid missing
#   -4 Valid missing
#   -5 Non-interview
new_data[] <- lapply(new_data, function(column) {
  column[column %in% -1:-5] <- NA
  column
})
# Convert the raw numeric codes into labelled factors.
#
# Args:
#   data: a data frame holding the columns R0000100, R0536300, R9859700 and
#     R9859900 (i.e. still named by reference number, not yet renamed by
#     qnames()).
# Returns:
#   `data` with those four columns replaced by factors. If there are values
#   not categorized they will be represented as NA.
vallabels <- function(data) {
  # Bins are closed on the left (right = FALSE): [0, 1), [1, 1000), ...
  # include.lowest = TRUE also closes the final bin on the right, so a value of
  # exactly 9999 is labelled "9000 TO 9999" instead of silently becoming NA.
  id_breaks <- c(0, 1, seq(1000, 9000, by = 1000), 9999)
  id_labels <- c(
    "0", "1 TO 999", "1000 TO 1999", "2000 TO 2999", "3000 TO 3999",
    "4000 TO 4999", "5000 TO 5999", "6000 TO 6999", "7000 TO 7999",
    "8000 TO 8999", "9000 TO 9999"
  )
  data$R0000100 <- cut(
    data$R0000100,
    breaks = id_breaks,
    labels = id_labels,
    right = FALSE,
    include.lowest = TRUE
  )

  data$R0536300 <- factor(
    data$R0536300,
    levels = c(1, 2, 0),
    labels = c("Male", "Female", "No Information")
  )

  # R9859700 and R9859900 use the same coding scheme, so define it once.
  conc_levels <- c(-8, -7, -6, 0, 1)
  conc_labels <- c(
    "Invalid Carnegie credits or only pre-hs courses",
    "All required courses taken but at least one course has missing credit info",
    "Criterion for one subject not met",
    "Does not meet all of the criteria",
    "Meets all of the criteria"
  )
  data$R9859700 <- factor(data$R9859700, levels = conc_levels, labels = conc_labels)
  data$R9859900 <- factor(data$R9859900, levels = conc_levels, labels = conc_labels)

  data
}
# Descriptive labels for the variables; presumably listed in the same order as
# the columns of the data file (TODO confirm against the codebook).
varlabels <- c(
  "PUBID - YTH ID CODE 1997",
  "KEY!SEX (SYMBOL) 1997",
  "TRANS_ACAD_CONC HSTR",
  "TRANS_VOC_CONC HSTR"
)
# Rename the columns from reference numbers (rnums) to question names (qnames).
#
# Args:
#   data: a data frame whose columns are to be renamed, in order.
# Returns:
#   `data` with its names set to PUBID_1997, sex, acdconc, vocconc.
qnames <- function(data) {
  question_names <- c("PUBID_1997", "sex", "acdconc", "vocconc")
  stats::setNames(data, question_names)
}
#*************************************************************************************
# Remove the '#' before the following line to create a data file called "categories" with value labels.
# (vallabels() reads the R-numbered columns, so it has to run before the rename below.)
#categories <- vallabels(new_data)
# Remove the '#' before the following lines to rename variables using Qnames instead of Reference Numbers
# From here on new_data carries the columns PUBID_1997, sex, acdconc and vocconc.
new_data <- qnames(new_data)
#categories <- qnames(categories)
# Produce summaries for the raw (uncategorized) data file
# The lines starting with '##' are the console output recorded when the script was run.
summary(new_data)
## PUBID_1997 sex acdconc vocconc
## Min. : 1 Min. :1.000 Min. :-8.0000 Min. :-8.0000
## 1st Qu.:2249 1st Qu.:1.000 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median :4502 Median :1.000 Median : 0.0000 Median : 0.0000
## Mean :4504 Mean :1.488 Mean :-0.1847 Mean : 0.1171
## 3rd Qu.:6758 3rd Qu.:2.000 3rd Qu.: 1.0000 3rd Qu.: 1.0000
## Max. :9022 Max. :2.000 Max. : 1.0000 Max. : 1.0000
## NA's :2752 NA's :2752
# Remove the '#' before the following lines to produce summaries for the "categories" data file.
# NOTE(review): at this point new_data has already been renamed by qnames(), so
# vallabels(new_data) would no longer find the R-numbered columns it reads.
# Build "categories" before the rename instead -- confirm before uncommenting.
#categories <- vallabels(new_data)
#summary(categories)
#************************************************************************************************************
# Here I am just taking a look at the data with glimpse and summarise.
# glimpse() lists every column with its type and first few values.
glimpse(new_data)
## Observations: 8,984
## Variables: 4
## $ PUBID_1997 (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ sex (int) 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, ...
## $ acdconc (int) 1, -7, NA, 0, 1, 0, NA, 1, 1, 1, 0, 0, 0, NA, 0, 0,...
## $ vocconc (int) 0, 1, NA, 1, 0, 0, NA, 0, 0, 0, 0, 0, 0, NA, 1, 0, ...
# NOTE(review): summarise() is called without any summary expressions, which is
# why the recorded result below is an empty data frame. summary(new_data) was
# presumably what was intended -- confirm.
summarise(new_data)
## data frame with 0 columns and 0 rows
# Here I am viewing the data in a small table (only the first rows are printed).
# NOTE(review): tbl_df() is deprecated in newer dplyr releases in favour of
# as_tibble() -- confirm against the installed version.
tbl_df(new_data)
## Source: local data frame [8,984 x 4]
##
## PUBID_1997 sex acdconc vocconc
## (int) (int) (int) (int)
## 1 1 2 1 0
## 2 2 1 -7 1
## 3 3 2 NA NA
## 4 4 2 0 1
## 5 5 1 1 0
## 6 6 2 0 0
## 7 7 1 NA NA
## 8 8 2 1 0
## 9 9 1 1 0
## 10 10 1 1 0
## .. ... ... ... ...
# Next I filter out the NA's and anything below 0 in the academic concentrator
# and vocational concentrator variables (the only negative codes still present
# are -6 and lower). I also filter out any invalid responses on the gender
# variable. Rows where any of the three conditions is NA are dropped as well.
new_datafltr <- new_data %>%
  filter(sex > 0, acdconc > -6, vocconc > -6)
# Here I am looking at the filtered data.
new_datafltr %>% tbl_df()
## Source: local data frame [5,697 x 4]
##
## PUBID_1997 sex acdconc vocconc
## (int) (int) (int) (int)
## 1 1 2 1 0
## 2 4 2 0 1
## 3 5 1 1 0
## 4 6 2 0 0
## 5 8 2 1 0
## 6 9 1 1 0
## 7 10 1 1 0
## 8 11 2 0 0
## 9 12 1 0 0
## 10 13 1 0 0
## .. ... ... ... ...
# Here I am putting the three variables (sex, acdconc, vocconc) into a three-way
# contingency table for a chi-square test.
new_datafltrtable<-xtabs(~new_datafltr$sex + new_datafltr$acdconc + new_datafltr$vocconc)
# Here I am looking at the new table with the data; ftable() prints the
# three-way table flattened into a single two-dimensional layout.
ftable(new_datafltrtable)
## new_datafltr$vocconc 0 1
## new_datafltr$sex new_datafltr$acdconc
## 1 0 1122 623
## 1 655 434
## 2 0 1143 412
## 1 939 369
# Here I am performing the chi-square test on the hypothesis: summary() of the
# table reports the test for independence of all three factors.
summary(new_datafltrtable)
## Call: xtabs(formula = ~new_datafltr$sex + new_datafltr$acdconc + new_datafltr$vocconc)
## Number of cases in table: 5697
## Number of factors: 3
## Test for independence of all factors:
## Chisq = 99.92, df = 4, p-value = 1.021e-20
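# ``` (stray Markdown code fence left over from the knitted output; commented out so the script parses as R)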