rm(list=ls())
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
census <- read.csv("/Users/maxineharlemon/AIOpt/Census_2023_Project.csv")
str(census)
## 'data.frame': 56198 obs. of 287 variables:
## $ RT : chr "P" "P" "P" "P" ...
## $ SERIALNO : chr "2023GQ0000242" "2023GQ0000286" "2023GQ0000365" "2023GQ0000417" ...
## $ DIVISION : int 5 5 5 5 5 5 5 5 5 5 ...
## $ SPORDER : int 1 1 1 1 1 1 1 1 1 1 ...
## $ PUMA : int 1402 1901 1800 2702 305 1200 800 1200 302 304 ...
## $ REGION : int 3 3 3 3 3 3 3 3 3 3 ...
## $ STATE : int 45 45 45 45 45 45 45 45 45 45 ...
## $ ADJINC : int 1019518 1019518 1019518 1019518 1019518 1019518 1019518 1019518 1019518 1019518 ...
## $ PWGTP : int 32 52 85 61 5 116 51 25 84 3 ...
## $ AGEP : int 54 27 72 19 13 57 21 72 40 78 ...
## $ CIT : int 1 1 1 1 1 1 1 1 1 1 ...
## $ CITWP : int NA NA NA NA NA NA NA NA NA NA ...
## $ COW : int NA 5 NA 1 NA NA 2 NA NA NA ...
## $ DDRS : int 2 2 1 2 2 2 2 1 1 1 ...
## $ DEAR : int 2 2 2 2 2 2 2 2 2 1 ...
## $ DEYE : int 2 2 2 2 2 2 2 2 2 2 ...
## $ DOUT : int 1 2 1 2 NA 2 2 1 1 1 ...
## $ DPHY : int 2 2 1 2 2 1 2 1 1 1 ...
## $ DRAT : int NA NA NA NA NA NA NA NA NA NA ...
## $ DRATX : int NA 2 NA NA NA NA NA NA NA NA ...
## $ DREM : int 1 2 1 2 2 2 2 1 2 1 ...
## $ ENG : int NA 1 NA NA NA 1 NA NA NA NA ...
## $ FER : int NA NA NA 2 NA NA NA NA NA NA ...
## $ GCL : int 2 NA 2 NA NA 2 NA 2 2 2 ...
## $ GCM : int NA NA NA NA NA NA NA NA NA NA ...
## $ GCR : int NA NA NA NA NA NA NA NA NA NA ...
## $ HIMRKS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HINS1 : int 2 2 2 1 2 2 2 2 2 2 ...
## $ HINS2 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ HINS3 : int 1 2 1 2 2 2 2 1 2 1 ...
## $ HINS4 : int 1 2 1 2 1 1 1 1 1 1 ...
## $ HINS5 : int 2 1 2 2 2 2 2 2 2 2 ...
## $ HINS6 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ HINS7 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ INTP : int 0 0 0 0 NA 0 0 0 0 0 ...
## $ JWMNP : int NA 5 NA 5 NA NA NA NA NA NA ...
## $ JWRIP : int NA 3 NA NA NA NA NA NA NA NA ...
## $ JWTRNS : int NA 1 NA 10 NA NA NA NA NA NA ...
## $ LANX : int 2 1 2 2 2 1 2 2 2 2 ...
## $ MAR : int 5 5 1 5 5 5 5 3 5 5 ...
## $ MARHD : int NA NA 2 NA NA NA NA 2 NA NA ...
## $ MARHM : int NA NA 2 NA NA NA NA 2 NA NA ...
## $ MARHT : int NA NA 2 NA NA NA NA 1 NA NA ...
## $ MARHW : int NA NA 2 NA NA NA NA 2 NA NA ...
## $ MARHYP : int NA NA 1994 NA NA NA NA 1994 NA NA ...
## $ MIG : int 1 3 1 3 3 1 1 1 1 1 ...
## $ MIL : int 4 1 4 4 NA 4 4 4 4 4 ...
## $ MLPA : int NA 1 NA NA NA NA NA NA NA NA ...
## $ MLPB : int NA 0 NA NA NA NA NA NA NA NA ...
## $ MLPCD : int NA 0 NA NA NA NA NA NA NA NA ...
## $ MLPE : int NA 0 NA NA NA NA NA NA NA NA ...
## $ MLPFG : int NA 0 NA NA NA NA NA NA NA NA ...
## $ MLPH : int NA 0 NA NA NA NA NA NA NA NA ...
## $ MLPIK : int NA 0 NA NA NA NA NA NA NA NA ...
## $ MLPJ : int NA 0 NA NA NA NA NA NA NA NA ...
## $ NWAB : int 2 3 3 3 NA 2 3 3 3 2 ...
## $ NWAV : int 5 5 5 5 NA 5 5 5 5 5 ...
## $ NWLA : int 2 3 3 3 NA 2 3 3 3 2 ...
## $ NWLK : int 2 3 3 3 NA 2 3 3 3 2 ...
## $ NWRE : int 3 3 3 3 NA 3 3 3 3 3 ...
## $ OIP : int 0 0 0 0 NA 0 0 0 0 0 ...
## $ PAP : int 0 0 0 0 NA 0 0 0 0 0 ...
## $ RELSHIPP : int 38 38 37 38 37 38 38 37 37 38 ...
## $ RETP : int 0 0 0 0 NA 0 0 0 0 0 ...
## $ SCH : int 1 1 1 2 2 1 2 1 1 1 ...
## $ SCHG : int NA NA NA 15 10 NA 15 NA NA NA ...
## $ SCHL : int 16 16 16 18 10 19 19 21 19 17 ...
## $ SEMP : int 0 0 0 0 NA 0 0 0 0 0 ...
## $ SEX : int 1 1 1 2 1 2 1 2 1 2 ...
## $ SSIP : int 2200 0 0 0 NA 0 0 0 0 0 ...
## $ SSP : int 0 0 4300 0 NA 10100 0 19200 2700 6600 ...
## $ WAGP : int 0 24000 0 10000 NA 0 5400 0 0 0 ...
## $ WKHP : int NA 40 NA 15 NA NA 34 NA NA NA ...
## $ WKL : int 3 1 3 1 NA 3 1 3 3 3 ...
## $ WKWN : int NA 52 NA 52 NA NA 23 NA NA NA ...
## $ WRK : int 2 1 NA 1 NA 2 NA NA NA 2 ...
## $ YOEP : int NA NA NA NA NA NA NA NA NA NA ...
## $ ANC : int 4 1 1 4 4 1 1 2 4 4 ...
## $ ANC1P : int 999 210 902 999 999 902 902 599 999 999 ...
## $ ANC2P : int 999 999 999 999 999 999 999 920 999 999 ...
## $ DECADE : int NA NA NA NA NA NA NA NA NA NA ...
## $ DIS : int 1 2 1 2 2 1 2 1 1 1 ...
## $ DRIVESP : int NA 3 NA NA NA NA NA NA NA NA ...
## $ ESP : int NA NA NA NA NA NA NA NA NA NA ...
## $ ESR : int 6 4 6 1 NA 6 2 6 6 6 ...
## $ FOD1P : int NA NA NA NA NA NA NA 5003 NA NA ...
## $ FOD2P : int NA NA NA NA NA NA NA NA NA NA ...
## $ HICOV : int 1 1 1 1 1 1 1 1 1 1 ...
## $ HISP : int 1 2 1 1 1 1 1 1 1 1 ...
## $ INDP : int NA 9770 NA 7870 NA NA 4971 NA NA NA ...
## $ JWAP : int NA 51 NA 189 NA NA NA NA NA NA ...
## $ JWDP : int NA 15 NA 114 NA NA NA NA NA NA ...
## $ LANP : int NA 1200 NA NA NA 1200 NA NA NA NA ...
## $ MIGPUMA : int NA 4200 NA 1200 1490 NA NA NA NA NA ...
## $ MIGSP : int NA 36 NA 24 45 NA NA NA NA NA ...
## $ MSP : int 6 6 2 6 NA 6 6 4 6 6 ...
## $ NAICSP : chr "" "92811P4" "" "611M1" ...
## $ NATIVITY : int 1 1 1 1 1 1 1 1 1 1 ...
## $ NOP : int NA NA NA NA NA NA NA NA NA NA ...
## [list output truncated]
keep <- c( "AGEP", "COW", "ESR", "ESP", "DRIVESP", "MAR", "OCCP", "RELSHIPP", "RAC1P", "SEX", "FWKHP", "WKHP", "PINCP")
psam_p45_small <- census <- census %>% select(all_of(keep))
write.csv(psam_p45_small, file = "psam_p45_small.csv")
dim(census)
## [1] 56198 13
census <- census %>%
rename(age = AGEP, workclass= COW, emp=ESR, employ_status_parent =ESP, vehicle_no =DRIVESP, marital_status = MAR, occupation = OCCP, relationship = RELSHIPP, race= RAC1P, sex= SEX, usual_hour_worked = FWKHP, hours_work=WKHP, total_income = PINCP)
head(census)
## age workclass emp employ_status_parent vehicle_no marital_status occupation
## 1 54 NA 6 NA NA 5 NA
## 2 27 5 4 NA 3 5 8600
## 3 72 NA 6 NA NA 1 NA
## 4 19 1 1 NA NA 5 4420
## 5 13 NA NA NA NA 5 NA
## 6 57 NA 6 NA NA 5 NA
## relationship race sex usual_hour_worked hours_work total_income
## 1 38 1 1 0 NA 2200
## 2 38 8 1 0 40 24000
## 3 37 2 1 0 NA 4300
## 4 38 1 2 0 15 10000
## 5 37 1 1 0 NA NA
## 6 38 2 2 0 NA 10100
#dim(census)
#str(census)
## delete row if income is NA
inc_na <- is.na(census$total_income)
census <- census[!inc_na, ]
dim(census)
## [1] 47965 13
## change income values to categorical
census$total_income <- ifelse(census$total_income >= 50000, ">=50K", "< 50K")
census$total_income <- as.factor(census$total_income)
#table(census$total_income)
#View(census)
census$sex =factor(census$sex,levels=c(1,2),labels= c("Male","Female"))
str(census$sex)
## Factor w/ 2 levels "Male","Female": 1 1 1 2 2 1 2 1 2 2 ...
table(census$sex)
##
## Male Female
## 22914 25051
census$race <- factor(census$race,levels = c(1,2,3,4,5,6,7,8,9),labels=c("White","Black or African American","American Indian","Alaska Native", "American Indian and Alaska Native","Asian", "Native Hawaiian and Other Pacific Islander", "Some Other Race alone", "Two or More Races" ))
table(census$race)
##
## White
## 33989
## Black or African American
## 9169
## American Indian
## 255
## Alaska Native
## 0
## American Indian and Alaska Native
## 63
## Asian
## 757
## Native Hawaiian and Other Pacific Islander
## 42
## Some Other Race alone
## 998
## Two or More Races
## 2692
census$workclass <- factor(census$workclass,levels = c(0,1,2,3,4,5,6,7,8,9),labels=c("< 16","Private for profit","Private non profit","Local Government","State Government", "Federal Government","Self Employed", "Self Employed Corporation", "Family Business/Farm", "Unemployed" ))
table(census$workclass)
##
## < 16 Private for profit Private non profit
## 0 21195 2391
## Local Government State Government Federal Government
## 2370 1748 1214
## Self Employed Self Employed Corporation Family Business/Farm
## 2122 1438 109
## Unemployed
## 169
census$emp <- factor(census$emp, levels = c(0,1,2,3,4,5,6),labels=c("<16","Civilian employed","Civilian employed/not at work","Unemployed","Armed Forces/at work", "Armed Forces/not at work","Not in workforce" ))
table(census$emp)
##
## <16 Civilian employed
## 0 24172
## Civilian employed/not at work Unemployed
## 472 1007
## Armed Forces/at work Armed Forces/not at work
## 393 3
## Not in workforce
## 21257
census$employ_status_parent <- factor(census$employ_status_parent, levels=c(0,1,2,3,4,5,6,7,8),labels=c("NA","2 Parent both Work","2 Parent Father work"," 2parent Mother work","2 Parent Neither work", "Father Father work","Father Father Unemployed", "Mother Mother works", "Mother Mother unemployed"))
table(census$employ_status_parent)
##
## NA 2 Parent both Work 2 Parent Father work
## 0 811 257
## 2parent Mother work 2 Parent Neither work Father Father work
## 63 20 99
## Father Father Unemployed Mother Mother works Mother Mother unemployed
## 26 373 67
census$vehicle_no <- factor(census$vehicle_no,levels = c(0,1,2,3,4,5,6),labels=c("NA","Drive Alone","2 Person Carpool","3 Person Carpool","4 Person Carpool", "5 6 Person Carpool","7 or more Carpool"))
table(census$vehicle_no)
##
## NA Drive Alone 2 Person Carpool 3 Person Carpool
## 0 18621 1572 303
## 4 Person Carpool 5 6 Person Carpool 7 or more Carpool
## 120 63 29
census$marital_status <- factor(census$marital_status, levels = c(1,2,3,4,5),labels=c("Married","Widowed","Divorced","Separated", "Never Married under 15" ))
table(census$marital_status)
##
## Married Widowed Divorced
## 24871 3644 4910
## Separated Never Married under 15
## 925 13615
census$relationship <- factor( census$relationship,
levels = c(20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38),
labels = c("Reference person", "husband/wife", "unmarried partner", "Same-sex husband/wife",
"Same-sex unmarried partner", "son or daughter", "Adopted son or daughter", "Stepson or stepdaughter",
"Brother or sister", "Father or mother", "Grandchild", "Parent-in-law", "Son-in-law or daughter-in-law",
"Other relative", "Roommate or housemate", "Foster child", "Other nonrelative", "Institutionalized group quarters population", "Noninstitutionalized group quarters population"))
census$occupation <- factor(census$occupation)
str(census)
## 'data.frame': 47965 obs. of 13 variables:
## $ age : int 54 27 72 19 57 21 72 40 78 20 ...
## $ workclass : Factor w/ 10 levels "< 16","Private for profit",..: NA 6 NA 2 NA 3 NA NA NA 2 ...
## $ emp : Factor w/ 7 levels "<16","Civilian employed",..: 7 5 7 2 7 3 7 7 7 7 ...
## $ employ_status_parent: Factor w/ 9 levels "NA","2 Parent both Work",..: NA NA NA NA NA NA NA NA NA NA ...
## $ vehicle_no : Factor w/ 7 levels "NA","Drive Alone",..: NA 4 NA NA NA NA NA NA NA NA ...
## $ marital_status : Factor w/ 5 levels "Married","Widowed",..: 5 5 1 5 5 5 3 5 5 5 ...
## $ occupation : Factor w/ 524 levels "10","20","40",..: NA 468 NA 277 NA 324 NA NA NA 295 ...
## $ relationship : Factor w/ 19 levels "Reference person",..: 19 19 18 19 19 19 18 18 19 19 ...
## $ race : Factor w/ 9 levels "White","Black or African American",..: 1 8 2 1 2 2 9 1 1 2 ...
## $ sex : Factor w/ 2 levels "Male","Female": 1 1 1 2 2 1 2 1 2 2 ...
## $ usual_hour_worked : int 0 0 0 0 0 1 0 0 0 0 ...
## $ hours_work : int NA 40 NA 15 NA 34 NA NA NA 30 ...
## $ total_income : Factor w/ 2 levels "< 50K",">=50K": 1 1 1 1 1 1 1 1 1 1 ...
summary(census)
## age workclass
## Min. :15.00 Private for profit:21195
## 1st Qu.:33.00 Private non profit: 2391
## Median :52.00 Local Government : 2370
## Mean :50.44 Self Employed : 2122
## 3rd Qu.:67.00 State Government : 1748
## Max. :93.00 (Other) : 2930
## NA's :15209
## emp employ_status_parent
## Civilian employed :24172 2 Parent both Work : 811
## Not in workforce :21257 Mother Mother works : 373
## Unemployed : 1007 2 Parent Father work : 257
## Civilian employed/not at work: 472 Father Father work : 99
## Armed Forces/at work : 393 Mother Mother unemployed: 67
## (Other) : 3 (Other) : 109
## NA's : 661 NA's :46249
## vehicle_no marital_status occupation
## Drive Alone :18621 Married :24871 440 : 875
## 2 Person Carpool : 1572 Widowed : 3644 4720 : 799
## 3 Person Carpool : 303 Divorced : 4910 2310 : 714
## 4 Person Carpool : 120 Separated : 925 3255 : 701
## 5 6 Person Carpool: 63 Never Married under 15:13615 9130 : 690
## (Other) : 29 (Other):28977
## NA's :27257 NA's :15209
## relationship
## Reference person :22891
## husband/wife :11614
## son or daughter : 5187
## Noninstitutionalized group quarters population: 1824
## Institutionalized group quarters population : 1362
## unmarried partner : 1121
## (Other) : 3966
## race sex usual_hour_worked
## White :33989 Male :22914 Min. :0.0000
## Black or African American: 9169 Female:25051 1st Qu.:0.0000
## Two or More Races : 2692 Median :0.0000
## Some Other Race alone : 998 Mean :0.1021
## Asian : 757 3rd Qu.:0.0000
## American Indian : 255 Max. :1.0000
## (Other) : 105
## hours_work total_income
## Min. : 1.00 < 50K:33039
## 1st Qu.:34.00 >=50K:14926
## Median :40.00
## Mean :37.79
## 3rd Qu.:40.00
## Max. :99.00
## NA's :20112
library(mdsr)
library(dplyr)
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
##
## mean
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
set.seed(364)
n <- nrow(census)
test_idx <- sample.int(n, size = round(0.2 * n))
train <- census[-test_idx, ]
nrow(train)
## [1] 38372
test <- census[test_idx, ]
nrow(test)
## [1] 9593
tally(~total_income, data = train, format = "percent")
## total_income
## < 50K >=50K
## 68.93568 31.06432
library(magrittr)
library(dplyr)
library(ggplot2)
library(rpart)
rpart.control(minsplit = 2, minbucket = 1, cp = 0.001)
## $minsplit
## [1] 2
##
## $minbucket
## [1] 1
##
## $cp
## [1] 0.001
##
## $maxcompete
## [1] 4
##
## $maxsurrogate
## [1] 5
##
## $usesurrogate
## [1] 2
##
## $surrogatestyle
## [1] 0
##
## $maxdepth
## [1] 30
##
## $xval
## [1] 10
rpart(total_income ~ usual_hour_worked, data = train, control = rpart.control())
## n= 38372
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 38372 11920 < 50K (0.6893568 0.3106432) *
dat_train <- as.formula("total_income ~ age + workclass + emp + employ_status_parent + vehicle_no + race + sex + usual_hour_worked + hours_work ")
model_tree <- rpart(dat_train, data = train)
model_tree
## n= 38372
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 38372 11920 < 50K (0.6893568 0.3106432)
## 2) emp=Civilian employed/not at work,Unemployed,Armed Forces/not at work,Not in workforce 18208 2621 < 50K (0.8560523 0.1439477) *
## 3) emp=Civilian employed,Armed Forces/at work 20164 9299 < 50K (0.5388316 0.4611684)
## 6) age< 24.5 3072 173 < 50K (0.9436849 0.0563151) *
## 7) age>=24.5 17092 7966 >=50K (0.4660660 0.5339340)
## 14) hours_work< 39.5 4028 1099 < 50K (0.7271599 0.2728401) *
## 15) hours_work>=39.5 13064 5037 >=50K (0.3855634 0.6144366)
## 30) race=Black or African American,American Indian,American Indian and Alaska Native,Some Other Race alone 2485 1024 < 50K (0.5879276 0.4120724) *
## 31) race=White,Asian,Native Hawaiian and Other Pacific Islander,Two or More Races 10579 3576 >=50K (0.3380282 0.6619718) *
plot(model_tree)
text(model_tree, use.n = TRUE, all= TRUE, cex = 0.5)

## Check Decision Tree Classifier (3)
library(rpart.plot)
rpart.plot(model_tree)

library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
plot(as.party(model_tree))

#complexity parameter (cp) table with relative error and cross-validation error
printcp(model_tree)
##
## Classification tree:
## rpart(formula = dat_train, data = train)
##
## Variables actually used in tree construction:
## [1] age emp hours_work race
##
## Root node error: 11920/38372 = 0.31064
##
## n= 38372
##
## CP nsplit rel error xerror xstd
## 1 0.083613 0 1.00000 1.00000 0.0076047
## 2 0.036661 3 0.74916 0.74916 0.0069443
## 3 0.010000 4 0.71250 0.71435 0.0068286
#Confusion matrix
train <- train %>%
mutate(income_tree = predict(model_tree, type = "class"))
confusion <- tally(income_tree ~ total_income, data = train, format = "count")
confusion
## total_income
## income_tree < 50K >=50K
## < 50K 22876 4917
## >=50K 3576 7003
accuracy <- sum(diag(confusion)) / nrow(train)
accuracy
## [1] 0.7786667
#Prediction in test dataset
# Confusion matrix
test <- test %>%
mutate(income_tree_test = predict(model_tree, type = "class", newdata = test))
confusion_test <- tally(income_tree_test ~ total_income, data = test, format = "count")
confusion_test
## total_income
## income_tree_test < 50K >=50K
## < 50K 5742 1235
## >=50K 845 1771
accuracy_test <- sum(diag(confusion_test)) / nrow(test)
accuracy_test
## [1] 0.7831752
# TUNING PARAMETERS
model_tree2 <- rpart(dat_train, data = test, control = rpart.control(cp = 0.002))
test <- test %>%
mutate(income_tree2 = predict(model_tree2, type = "class"))
confusion_test_tune <- tally(income_tree2 ~ total_income, data = test, format = "count")
confusion_test_tune
## total_income
## income_tree2 < 50K >=50K
## < 50K 5650 1102
## >=50K 937 1904
accuracy_test_tune <- sum(diag(confusion_test_tune)) / nrow(test)
accuracy_test_tune
## [1] 0.7874492