pnorm(-0.66,9,4.6,lower.tail = T) ##算出来的是one-tail,题目要求2 tails则 0.0178*2
## [1] 0.01786442
`
Interpretation
T-value=coefficient/standard error P-value 用tcdf(计算机) 或者 online calculator: https://www.socscistatistics.com/pvalues/tdistribution.aspx
If the p-value associated with this t-statistic is less than your significance level, you conclude that the coefficient is statistically significant at your significant level
example: l. A regression coefficient is estimated to be equal to 4.96 with standard error 6.2 ; there are 6 degrees of freedom. What is the p-value (from the t-statistic) against the null hypothesis of zero?
set up sinigficant level=0.05 T-obs = 4.96/6.2=0.8 use online calculator: two-tails test
load("~/Desktop/.RData")
attach(dat_NYC)
## The following object is masked _by_ .GlobalEnv:
##
## OWNCOST
dat_1<-data.frame(acs2017_ny[,c("EDUCD","TRANWORK")])
work_home<-table(dat_1$EDUC ,dat_1$TRANWORK==70)
work_home
##
## FALSE TRUE
## N/A or no schooling 0 0
## N/A 5569 0
## No schooling completed 6268 42
## Nursery school to grade 4 0 0
## Nursery school, preschool 2760 0
## Kindergarten 2247 0
## Grade 1, 2, 3, or 4 0 0
## Grade 1 2131 0
## Grade 2 2273 3
## Grade 3 2417 1
## Grade 4 2405 3
## Grade 5, 6, 7, or 8 0 0
## Grade 5 or 6 0 0
## Grade 5 2832 3
## Grade 6 3347 18
## Grade 7 or 8 0 0
## Grade 7 2835 4
## Grade 8 3983 57
## Grade 9 4068 20
## Grade 10 4622 22
## Grade 11 5286 51
## Grade 12 0 0
## 12th grade, no diploma 3839 40
## High school graduate or GED 0 0
## Regular high school diploma 35117 572
## GED or alternative credential 6368 97
## Some college, but less than 1 year 8903 183
## 1 year of college 0 0
## 1 or more years of college credit, no degree 19434 513
## 2 years of college 0 0
## Associate's degree, type not specified 13736 329
## Associate's degree, occupational program 0 0
## Associate's degree, academic program 0 0
## 3 years of college 0 0
## 4 years of college 0 0
## Bachelor's degree 29538 1264
## 5+ years of college 0 0
## 6 years of college (6+ in 1960-1970) 0 0
## 7 years of college 0 0
## 8+ years of college 0 0
## Master's degree 16352 658
## Professional degree beyond a bachelor's degree 3886 165
## Doctoral degree 2201 123
## Missing 0 0
summary(work_home)
## Number of cases in table: 196585
## Number of factors: 2
## Test for independence of all factors:
## Chisq = NaN, df = 43, p-value = NA
## Chi-squared approximation may be incorrect
EDUCATION ATTAINMENTS: P(above 4-year-degree working from home)=(1264+658+165+123)/(4168)=0.5295
Commute_subway<-table(dat_1$EDUCD,dat_1$TRANWORK==33)
Commute_subway
##
## FALSE TRUE
## N/A or no schooling 0 0
## N/A 5569 0
## No schooling completed 6085 225
## Nursery school to grade 4 0 0
## Nursery school, preschool 2758 2
## Kindergarten 2244 3
## Grade 1, 2, 3, or 4 0 0
## Grade 1 2127 4
## Grade 2 2268 8
## Grade 3 2393 25
## Grade 4 2382 26
## Grade 5, 6, 7, or 8 0 0
## Grade 5 or 6 0 0
## Grade 5 2786 49
## Grade 6 3220 145
## Grade 7 or 8 0 0
## Grade 7 2795 44
## Grade 8 3923 117
## Grade 9 3960 128
## Grade 10 4510 134
## Grade 11 5159 178
## Grade 12 0 0
## 12th grade, no diploma 3600 279
## High school graduate or GED 0 0
## Regular high school diploma 33517 2172
## GED or alternative credential 6000 465
## Some college, but less than 1 year 8577 509
## 1 year of college 0 0
## 1 or more years of college credit, no degree 18462 1485
## 2 years of college 0 0
## Associate's degree, type not specified 13112 953
## Associate's degree, occupational program 0 0
## Associate's degree, academic program 0 0
## 3 years of college 0 0
## 4 years of college 0 0
## Bachelor's degree 26045 4757
## 5+ years of college 0 0
## 6 years of college (6+ in 1960-1970) 0 0
## 7 years of college 0 0
## 8+ years of college 0 0
## Master's degree 14893 2117
## Professional degree beyond a bachelor's degree 3466 585
## Doctoral degree 2053 271
## Missing 0 0
summary(Commute_subway)
## Number of cases in table: 196585
## Number of factors: 2
## Test for independence of all factors:
## Chisq = NaN, df = 43, p-value = NA
## Chi-squared approximation may be incorrect
dat_2 <- data.frame(acs2017_ny[,c("SEX","TRANWORK")])
fem_home<-table(dat_2$SEX,dat_2$TRANWORK==70)
fem_home
##
## FALSE TRUE
## Male 93118 2104
## Female 99299 2064
summary(fem_home)
## Number of cases in table: 196585
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 7.108, df = 1, p-value = 0.007675
detach(dat_NYC)
P(fem_home)=2064/(2104+2064)= P(m_home)=2104/(2104+2064)=
attach(acs2017_ny)
## The following objects are masked _by_ .GlobalEnv:
##
## Commute_subway, OWNCOST
use_varb <- (AGE >= 23) & (AGE <= 62) & (LABFORCE == 2) & (WKSWORK2 > 4) & (UHRSWORK >= 35)
dat_use <- subset(acs2017_ny,use_varb)
detach()
attach(dat_use)
## The following objects are masked _by_ .GlobalEnv:
##
## Commute_subway, OWNCOST
##2. other demographic effects(age and race/ethnicity)
eff2_demog <- lm(TRANWORK~ AGE + RACE + + AfAm + Asian + Amindian + race_oth + Hispanic,data=dat_use )
summary(eff2_demog)
##
## Call:
## lm(formula = TRANWORK ~ AGE + RACE + +AfAm + Asian + Amindian +
## race_oth + Hispanic, data = dat_use)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.710 -9.276 -7.296 10.057 53.684
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.403352 0.290556 70.222 < 2e-16 ***
## AGE -0.090008 0.005713 -15.756 < 2e-16 ***
## RACE 1.573371 0.110851 14.194 < 2e-16 ***
## AfAm 2.903284 0.228269 12.719 < 2e-16 ***
## Asian 3.590347 0.292619 12.270 < 2e-16 ***
## Amindian -3.316980 1.129212 -2.937 0.00331 **
## race_oth -6.651436 0.602229 -11.045 < 2e-16 ***
## Hispanic 3.244702 0.220860 14.691 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.63 on 59426 degrees of freedom
## Multiple R-squared: 0.03319, Adjusted R-squared: 0.03307
## F-statistic: 291.4 on 7 and 59426 DF, p-value: < 2.2e-16
load("~/Desktop/.RData")
attach(acs2017_ny)
## The following objects are masked _by_ .GlobalEnv:
##
## Commute_subway, OWNCOST
## The following objects are masked from dat_use:
##
## AfAm, AGE, Amindian, ANCESTR1, ANCESTR1D, ANCESTR2, ANCESTR2D,
## Asian, below_150poverty, below_200poverty, below_povertyline, BPL,
## BPLD, BUILTYR2, CITIZEN, CLASSWKR, CLASSWKRD, Commute_bus,
## Commute_car, Commute_other, Commute_rail, Commute_subway, COSTELEC,
## COSTFUEL, COSTGAS, COSTWATR, DEGFIELD, DEGFIELD2, DEGFIELD2D,
## DEGFIELDD, DEPARTS, EDUC, educ_advdeg, educ_college, educ_hs,
## educ_nohs, educ_somecoll, EDUCD, EMPSTAT, EMPSTATD, FAMSIZE,
## female, foodstamps, FOODSTMP, FTOTINC, FUELHEAT, GQ,
## has_AnyHealthIns, has_PvtHealthIns, HCOVANY, HCOVPRIV, HHINCOME,
## Hisp_Cuban, Hisp_DomR, Hisp_Mex, Hisp_PR, HISPAN, HISPAND,
## Hispanic, in_Bronx, in_Brooklyn, in_Manhattan, in_Nassau, in_NYC,
## in_Queens, in_StatenI, in_Westchester, INCTOT, INCWAGE, IND,
## LABFORCE, LINGISOL, MARST, MIGCOUNTY1, MIGPLAC1, MIGPUMA1,
## MIGRATE1, MIGRATE1D, MORTGAGE, NCHILD, NCHLT5, OCC, OWNCOST,
## OWNERSHP, OWNERSHPD, POVERTY, PUMA, PWPUMA00, RACE, race_oth,
## RACED, RELATE, RELATED, RENT, ROOMS, SCHOOL, SEX, SSMC, TRANTIME,
## TRANWORK, UHRSWORK, UNITSSTR, unmarried, veteran, VETSTAT,
## VETSTATD, white, WKSWORK2, YRSUSA1
use_varb <- (AGE >= 23) & (AGE <= 62) & (LABFORCE == 2) & (WKSWORK2 > 4) & (UHRSWORK >= 35)
dat_use_1 <- subset(acs2017_ny,use_varb)
detach()
attach(dat_use_1)
## The following objects are masked _by_ .GlobalEnv:
##
## Commute_subway, OWNCOST
##
## The following objects are masked from dat_use:
##
## AfAm, AGE, Amindian, ANCESTR1, ANCESTR1D, ANCESTR2, ANCESTR2D,
## Asian, below_150poverty, below_200poverty, below_povertyline, BPL,
## BPLD, BUILTYR2, CITIZEN, CLASSWKR, CLASSWKRD, Commute_bus,
## Commute_car, Commute_other, Commute_rail, Commute_subway, COSTELEC,
## COSTFUEL, COSTGAS, COSTWATR, DEGFIELD, DEGFIELD2, DEGFIELD2D,
## DEGFIELDD, DEPARTS, EDUC, educ_advdeg, educ_college, educ_hs,
## educ_nohs, educ_somecoll, EDUCD, EMPSTAT, EMPSTATD, FAMSIZE,
## female, foodstamps, FOODSTMP, FTOTINC, FUELHEAT, GQ,
## has_AnyHealthIns, has_PvtHealthIns, HCOVANY, HCOVPRIV, HHINCOME,
## Hisp_Cuban, Hisp_DomR, Hisp_Mex, Hisp_PR, HISPAN, HISPAND,
## Hispanic, in_Bronx, in_Brooklyn, in_Manhattan, in_Nassau, in_NYC,
## in_Queens, in_StatenI, in_Westchester, INCTOT, INCWAGE, IND,
## LABFORCE, LINGISOL, MARST, MIGCOUNTY1, MIGPLAC1, MIGPUMA1,
## MIGRATE1, MIGRATE1D, MORTGAGE, NCHILD, NCHLT5, OCC, OWNCOST,
## OWNERSHP, OWNERSHPD, POVERTY, PUMA, PWPUMA00, RACE, race_oth,
## RACED, RELATE, RELATED, RENT, ROOMS, SCHOOL, SEX, SSMC, TRANTIME,
## TRANWORK, UHRSWORK, UNITSSTR, unmarried, veteran, VETSTAT,
## VETSTATD, white, WKSWORK2, YRSUSA1
tran_educ<- lm(TRANWORK~ AGE + female + AfAm + Asian + Amindian + race_oth + Hispanic + educ_hs + educ_somecoll + educ_college + educ_advdeg, data= dat_use_1)
summary(tran_educ)
##
## Call:
## lm(formula = TRANWORK ~ AGE + female + AfAm + Asian + Amindian +
## race_oth + Hispanic + educ_hs + educ_somecoll + educ_college +
## educ_advdeg, data = dat_use_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.092 -10.274 -5.884 10.387 55.584
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.704588 0.413328 54.931 < 2e-16 ***
## AGE -0.067777 0.005761 -11.766 < 2e-16 ***
## female -0.090216 0.129105 -0.699 0.4847
## AfAm 4.849737 0.205232 23.631 < 2e-16 ***
## Asian 4.968136 0.254265 19.539 < 2e-16 ***
## Amindian 0.149781 1.104305 0.136 0.8921
## race_oth 1.340083 0.235544 5.689 1.28e-08 ***
## Hispanic 4.322205 0.217765 19.848 < 2e-16 ***
## educ_hs -3.702825 0.322274 -11.490 < 2e-16 ***
## educ_somecoll -3.996220 0.330571 -12.089 < 2e-16 ***
## educ_college 0.709981 0.326812 2.172 0.0298 *
## educ_advdeg 0.248176 0.335923 0.739 0.4600
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.51 on 59422 degrees of freedom
## Multiple R-squared: 0.04768, Adjusted R-squared: 0.0475
## F-statistic: 270.5 on 11 and 59422 DF, p-value: < 2.2e-16
plot(tran_educ)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
stargazer(tran_educ, type="text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## TRANWORK
## -----------------------------------------------
## AGE -0.068***
## (0.006)
##
## female -0.090
## (0.129)
##
## AfAm 4.850***
## (0.205)
##
## Asian 4.968***
## (0.254)
##
## Amindian 0.150
## (1.104)
##
## race_oth 1.340***
## (0.236)
##
## Hispanic 4.322***
## (0.218)
##
## educ_hs -3.703***
## (0.322)
##
## educ_somecoll -3.996***
## (0.331)
##
## educ_college 0.710**
## (0.327)
##
## educ_advdeg 0.248
## (0.336)
##
## Constant 22.705***
## (0.413)
##
## -----------------------------------------------
## Observations 59,434
## R2 0.048
## Adjusted R2 0.048
## Residual Std. Error 15.511 (df = 59422)
## F Statistic 270.451*** (df = 11; 59422)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
NNobs <- length(TRANWORK)
set.seed(12345)
graph_obs <- (runif(NNobs) < 0.1)
dat_graph <-subset (dat_use, graph_obs)
to_be_predicted <- data.frame(AGE = 23:65, female = 1,AfAm = 1,Asian = 0,Amindian = 1, race_oth = 1,Hispanic = 1,educ_hs = 0, educ_somecoll = 0,educ_college = 1,educ_advdeg = 0)
to_be_predicted$yhat <- predict(tran_educ, newdata = to_be_predicted)
summary(to_be_predicted$yhat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 29.58 30.29 31.00 31.00 31.72 32.43
plot(yhat ~ educ_college, data = to_be_predicted)
lines(yhat~educ_advdeg,data=to_be_predicted)
detach()
Create a k-nn model to try to predict how a person commutes to work. (Worry a bit about the zero values of TRANWORK since that typically means they’re not working.) How useful is this model at predicting? What are some of the important variables in this prediction?
## Here we analyze "UHRSWORK"(usual houors worked per week), TRANTIME and INCTOT as possible classifications to predict people's means of commuting.
## 1.设置dataframe
dat_NYC <- subset(acs2017_ny, (acs2017_ny$in_NYC == 1)&(acs2017_ny$AGE > 18) & (acs2017_ny$AGE < 66))
attach(dat_NYC)
## The following objects are masked _by_ .GlobalEnv:
##
## Commute_subway, OWNCOST
## The following objects are masked from dat_use_1:
##
## AfAm, AGE, Amindian, ANCESTR1, ANCESTR1D, ANCESTR2, ANCESTR2D,
## Asian, below_150poverty, below_200poverty, below_povertyline, BPL,
## BPLD, BUILTYR2, CITIZEN, CLASSWKR, CLASSWKRD, Commute_bus,
## Commute_car, Commute_other, Commute_rail, Commute_subway, COSTELEC,
## COSTFUEL, COSTGAS, COSTWATR, DEGFIELD, DEGFIELD2, DEGFIELD2D,
## DEGFIELDD, DEPARTS, EDUC, educ_advdeg, educ_college, educ_hs,
## educ_nohs, educ_somecoll, EDUCD, EMPSTAT, EMPSTATD, FAMSIZE,
## female, foodstamps, FOODSTMP, FTOTINC, FUELHEAT, GQ,
## has_AnyHealthIns, has_PvtHealthIns, HCOVANY, HCOVPRIV, HHINCOME,
## Hisp_Cuban, Hisp_DomR, Hisp_Mex, Hisp_PR, HISPAN, HISPAND,
## Hispanic, in_Bronx, in_Brooklyn, in_Manhattan, in_Nassau, in_NYC,
## in_Queens, in_StatenI, in_Westchester, INCTOT, INCWAGE, IND,
## LABFORCE, LINGISOL, MARST, MIGCOUNTY1, MIGPLAC1, MIGPUMA1,
## MIGRATE1, MIGRATE1D, MORTGAGE, NCHILD, NCHLT5, OCC, OWNCOST,
## OWNERSHP, OWNERSHPD, POVERTY, PUMA, PWPUMA00, RACE, race_oth,
## RACED, RELATE, RELATED, RENT, ROOMS, SCHOOL, SEX, SSMC, TRANTIME,
## TRANWORK, UHRSWORK, UNITSSTR, unmarried, veteran, VETSTAT,
## VETSTATD, white, WKSWORK2, YRSUSA1
## The following objects are masked from dat_use:
##
## AfAm, AGE, Amindian, ANCESTR1, ANCESTR1D, ANCESTR2, ANCESTR2D,
## Asian, below_150poverty, below_200poverty, below_povertyline, BPL,
## BPLD, BUILTYR2, CITIZEN, CLASSWKR, CLASSWKRD, Commute_bus,
## Commute_car, Commute_other, Commute_rail, Commute_subway, COSTELEC,
## COSTFUEL, COSTGAS, COSTWATR, DEGFIELD, DEGFIELD2, DEGFIELD2D,
## DEGFIELDD, DEPARTS, EDUC, educ_advdeg, educ_college, educ_hs,
## educ_nohs, educ_somecoll, EDUCD, EMPSTAT, EMPSTATD, FAMSIZE,
## female, foodstamps, FOODSTMP, FTOTINC, FUELHEAT, GQ,
## has_AnyHealthIns, has_PvtHealthIns, HCOVANY, HCOVPRIV, HHINCOME,
## Hisp_Cuban, Hisp_DomR, Hisp_Mex, Hisp_PR, HISPAN, HISPAND,
## Hispanic, in_Bronx, in_Brooklyn, in_Manhattan, in_Nassau, in_NYC,
## in_Queens, in_StatenI, in_Westchester, INCTOT, INCWAGE, IND,
## LABFORCE, LINGISOL, MARST, MIGCOUNTY1, MIGPLAC1, MIGPUMA1,
## MIGRATE1, MIGRATE1D, MORTGAGE, NCHILD, NCHLT5, OCC, OWNCOST,
## OWNERSHP, OWNERSHPD, POVERTY, PUMA, PWPUMA00, RACE, race_oth,
## RACED, RELATE, RELATED, RENT, ROOMS, SCHOOL, SEX, SSMC, TRANTIME,
## TRANWORK, UHRSWORK, UNITSSTR, unmarried, veteran, VETSTAT,
## VETSTATD, white, WKSWORK2, YRSUSA1
commute_f <- factor((dat_NYC$Commute_bus+2*dat_NYC$Commute_rail+3*dat_NYC$Commute_subway+4*dat_NYC$Commute_car+5*dat_NYC$Commute_other),levels = c(1,2,3,4,5), labels = c("bus","rail","subway","car","other"))
## 2.标准化数据
norm_varb <- function(X_in) {
(max(X_in, na.rm = TRUE) - X_in)/( max(X_in, na.rm = TRUE) - min(X_in, na.rm = TRUE) )
}
is.na(TRANWORK)<-which(TRANWORK==0) ## 去除tranwork中为0的数据
tran_time <- TRANTIME+UHRSWORK
norm_tran_time <-norm_varb(tran_time) ## 对研究变量标准化
fam_inc <- INCTOT ## 选定研究的变量
norm_fam_inc<- norm_varb(fam_inc)
dat_use_prelim_2 <-data.frame(norm_tran_time,norm_fam_inc) ## 1)合并标准化后的数据,用data。frame的公式
good_obs_data_use <- complete.cases(dat_use_prelim_2,commute_f) ## 第二次嵌套,两组变量数据框与 划分分类数据框的嵌套 ==(完整数据)
dat_use <-subset(dat_use_prelim_2,good_obs_data_use) ## 第三次: 两组标准化后的数据框与 完整数据框的嵌套
y_use <-subset(commute_f,good_obs_data_use) ## 第四次嵌套:划分分类 与完整数据框的嵌套
## 第三步: 设置train data 和 test data。
set.seed(12345)
NN_obs <- sum( good_obs_data_use== 1) ## 用函数sum 来获取观察对象(注意:数量和 标准化)
select1 <- (runif(NN_obs)< 0.7) ## select1 设置为train data
train_data_2 <- subset(dat_use, select1)
test_data_2 <- subset(dat_use,(!select1))
cl_data <-y_use[select1]
true_data <- y_use[!select1]
summary(cl_data)
## bus rail subway car other
## 2059 346 9702 6445 3540
prop.table(summary(cl_data))
## bus rail subway car other
## 0.09320116 0.01566178 0.43916350 0.29173456 0.16023900
summary(train_data_2)
## norm_tran_time norm_fam_inc
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.5781 1st Qu.:0.9377
## Median :0.6624 Median :0.9627
## Mean :0.6581 Mean :0.9469
## 3rd Qu.:0.7468 3rd Qu.:0.9790
## Max. :0.9916 Max. :1.0000
## 第四步:运行knn test
require(class)
## Loading required package: class
for(indx in seq(1,9, by=2)) {
pre_commute <-knn(train_data_2, test_data_2, cl_data, k= indx, l=0, prob=F, use.all=T)
num_correct_labels <-sum(pre_commute==y_use[!select1])
correct_rate <-(num_correct_labels/length(true_data))*100
print(c(indx,correct_rate))
}
## [1] 1.00000 47.28699
## [1] 3.0000 49.7475
## [1] 5.00000 50.58558
## [1] 7.00000 51.27324
## [1] 9.00000 51.61706
## q1: for 函数这里还有疑惑。
## pre_commute
print(summary(pre_commute))
## bus rail subway car other
## 102 4 5803 2273 1125
Second Run
dat_NYC <- subset(acs2017_ny, (acs2017_ny$in_NYC == 1)&(acs2017_ny$AGE > 18) & (acs2017_ny$AGE < 66))
attach(dat_NYC)
## The following objects are masked _by_ .GlobalEnv:
##
## Commute_subway, OWNCOST, TRANWORK
## The following objects are masked from dat_NYC (pos = 4):
##
## AfAm, AGE, Amindian, ANCESTR1, ANCESTR1D, ANCESTR2, ANCESTR2D,
## Asian, below_150poverty, below_200poverty, below_povertyline, BPL,
## BPLD, BUILTYR2, CITIZEN, CLASSWKR, CLASSWKRD, Commute_bus,
## Commute_car, Commute_other, Commute_rail, Commute_subway, COSTELEC,
## COSTFUEL, COSTGAS, COSTWATR, DEGFIELD, DEGFIELD2, DEGFIELD2D,
## DEGFIELDD, DEPARTS, EDUC, educ_advdeg, educ_college, educ_hs,
## educ_nohs, educ_somecoll, EDUCD, EMPSTAT, EMPSTATD, FAMSIZE,
## female, foodstamps, FOODSTMP, FTOTINC, FUELHEAT, GQ,
## has_AnyHealthIns, has_PvtHealthIns, HCOVANY, HCOVPRIV, HHINCOME,
## Hisp_Cuban, Hisp_DomR, Hisp_Mex, Hisp_PR, HISPAN, HISPAND,
## Hispanic, in_Bronx, in_Brooklyn, in_Manhattan, in_Nassau, in_NYC,
## in_Queens, in_StatenI, in_Westchester, INCTOT, INCWAGE, IND,
## LABFORCE, LINGISOL, MARST, MIGCOUNTY1, MIGPLAC1, MIGPUMA1,
## MIGRATE1, MIGRATE1D, MORTGAGE, NCHILD, NCHLT5, OCC, OWNCOST,
## OWNERSHP, OWNERSHPD, POVERTY, PUMA, PWPUMA00, RACE, race_oth,
## RACED, RELATE, RELATED, RENT, ROOMS, SCHOOL, SEX, SSMC, TRANTIME,
## TRANWORK, UHRSWORK, UNITSSTR, unmarried, veteran, VETSTAT,
## VETSTATD, white, WKSWORK2, YRSUSA1
## The following objects are masked from dat_use_1:
##
## AfAm, AGE, Amindian, ANCESTR1, ANCESTR1D, ANCESTR2, ANCESTR2D,
## Asian, below_150poverty, below_200poverty, below_povertyline, BPL,
## BPLD, BUILTYR2, CITIZEN, CLASSWKR, CLASSWKRD, Commute_bus,
## Commute_car, Commute_other, Commute_rail, Commute_subway, COSTELEC,
## COSTFUEL, COSTGAS, COSTWATR, DEGFIELD, DEGFIELD2, DEGFIELD2D,
## DEGFIELDD, DEPARTS, EDUC, educ_advdeg, educ_college, educ_hs,
## educ_nohs, educ_somecoll, EDUCD, EMPSTAT, EMPSTATD, FAMSIZE,
## female, foodstamps, FOODSTMP, FTOTINC, FUELHEAT, GQ,
## has_AnyHealthIns, has_PvtHealthIns, HCOVANY, HCOVPRIV, HHINCOME,
## Hisp_Cuban, Hisp_DomR, Hisp_Mex, Hisp_PR, HISPAN, HISPAND,
## Hispanic, in_Bronx, in_Brooklyn, in_Manhattan, in_Nassau, in_NYC,
## in_Queens, in_StatenI, in_Westchester, INCTOT, INCWAGE, IND,
## LABFORCE, LINGISOL, MARST, MIGCOUNTY1, MIGPLAC1, MIGPUMA1,
## MIGRATE1, MIGRATE1D, MORTGAGE, NCHILD, NCHLT5, OCC, OWNCOST,
## OWNERSHP, OWNERSHPD, POVERTY, PUMA, PWPUMA00, RACE, race_oth,
## RACED, RELATE, RELATED, RENT, ROOMS, SCHOOL, SEX, SSMC, TRANTIME,
## TRANWORK, UHRSWORK, UNITSSTR, unmarried, veteran, VETSTAT,
## VETSTATD, white, WKSWORK2, YRSUSA1
## The following objects are masked from dat_use:
##
## AfAm, AGE, Amindian, ANCESTR1, ANCESTR1D, ANCESTR2, ANCESTR2D,
## Asian, below_150poverty, below_200poverty, below_povertyline, BPL,
## BPLD, BUILTYR2, CITIZEN, CLASSWKR, CLASSWKRD, Commute_bus,
## Commute_car, Commute_other, Commute_rail, Commute_subway, COSTELEC,
## COSTFUEL, COSTGAS, COSTWATR, DEGFIELD, DEGFIELD2, DEGFIELD2D,
## DEGFIELDD, DEPARTS, EDUC, educ_advdeg, educ_college, educ_hs,
## educ_nohs, educ_somecoll, EDUCD, EMPSTAT, EMPSTATD, FAMSIZE,
## female, foodstamps, FOODSTMP, FTOTINC, FUELHEAT, GQ,
## has_AnyHealthIns, has_PvtHealthIns, HCOVANY, HCOVPRIV, HHINCOME,
## Hisp_Cuban, Hisp_DomR, Hisp_Mex, Hisp_PR, HISPAN, HISPAND,
## Hispanic, in_Bronx, in_Brooklyn, in_Manhattan, in_Nassau, in_NYC,
## in_Queens, in_StatenI, in_Westchester, INCTOT, INCWAGE, IND,
## LABFORCE, LINGISOL, MARST, MIGCOUNTY1, MIGPLAC1, MIGPUMA1,
## MIGRATE1, MIGRATE1D, MORTGAGE, NCHILD, NCHLT5, OCC, OWNCOST,
## OWNERSHP, OWNERSHPD, POVERTY, PUMA, PWPUMA00, RACE, race_oth,
## RACED, RELATE, RELATED, RENT, ROOMS, SCHOOL, SEX, SSMC, TRANTIME,
## TRANWORK, UHRSWORK, UNITSSTR, unmarried, veteran, VETSTAT,
## VETSTATD, white, WKSWORK2, YRSUSA1
commute_f <- factor((dat_NYC$Commute_bus+2*dat_NYC$Commute_rail+3*dat_NYC$Commute_subway+4*dat_NYC$Commute_car+5*dat_NYC$Commute_other),levels = c(1,2,3,4,5), labels = c("bus","rail","subway","car","other"))
norm_varb <- function(X_in) {
(max(X_in, na.rm = TRUE) - X_in)/( max(X_in, na.rm = TRUE) - min(X_in, na.rm = TRUE) )
}
is.na(TRANWORK)<-which(TRANWORK==0)
tran_fambackground <- FAMSIZE+COSTELEC+COSTGAS+COSTWATR+COSTFUEL
norm_tran_fambackground <-norm_varb(tran_fambackground)
fam_wage <-INCWAGE
norm_fam_poverty<-norm_varb(fam_wage)
dat_use_prelim_2 <-data.frame(norm_tran_fambackground,fam_wage)
good_obs_data_use <- complete.cases(dat_use_prelim_2,commute_f)
dat_use <-subset(dat_use_prelim_2,good_obs_data_use)
y_use <-subset(commute_f,good_obs_data_use)
set.seed(12345)
NN_obs <- sum( good_obs_data_use== 1)
select1 <- (runif(NN_obs)< 0.7)
train_data_2 <- subset(dat_use, select1)
test_data_2 <- subset(dat_use,(!select1))
cl_data <-y_use[select1]
true_data <- y_use[!select1]
summary(cl_data)
## bus rail subway car other
## 2059 346 9702 6445 3540
prop.table(summary(cl_data))
## bus rail subway car other
## 0.09320116 0.01566178 0.43916350 0.29173456 0.16023900
summary(train_data_2)
## norm_tran_fambackground fam_wage
## Min. :0.0000 Min. : 0
## 1st Qu.:0.2234 1st Qu.: 19000
## Median :0.4463 Median : 40000
## Mean :0.4034 Mean : 60695
## 3rd Qu.:0.5775 3rd Qu.: 75000
## Max. :1.0000 Max. :638000
require(class)
for(indx in seq(1,9, by=2)) {
pre_commute <-knn(train_data_2, test_data_2, cl_data, k= indx, l=0, prob=F, use.all=T)
num_correct_labels <-sum(pre_commute==y_use[!select1])
correct_rate <-(num_correct_labels/length(true_data))*100
print(c(indx,correct_rate))
}
## [1] 1.00000 35.96218
## [1] 3.00000 38.15408
## [1] 5.00000 40.42119
## [1] 7.0000 41.9899
## [1] 9.00000 43.16106
print(summary(pre_commute))
## bus rail subway car other
## 197 1 5846 2582 681
```