US CENSUS DATA

rm(list=ls())
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

census <- read.csv("/Users/maxineharlemon/AIOpt/Census_2023_Project.csv")
str(census)

## 'data.frame':    56198 obs. of  287 variables:
##  $ RT       : chr  "P" "P" "P" "P" ...
##  $ SERIALNO : chr  "2023GQ0000242" "2023GQ0000286" "2023GQ0000365" "2023GQ0000417" ...
##  $ DIVISION : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ SPORDER  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ PUMA     : int  1402 1901 1800 2702 305 1200 800 1200 302 304 ...
##  $ REGION   : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ STATE    : int  45 45 45 45 45 45 45 45 45 45 ...
##  $ ADJINC   : int  1019518 1019518 1019518 1019518 1019518 1019518 1019518 1019518 1019518 1019518 ...
##  $ PWGTP    : int  32 52 85 61 5 116 51 25 84 3 ...
##  $ AGEP     : int  54 27 72 19 13 57 21 72 40 78 ...
##  $ CIT      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ CITWP    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ COW      : int  NA 5 NA 1 NA NA 2 NA NA NA ...
##  $ DDRS     : int  2 2 1 2 2 2 2 1 1 1 ...
##  $ DEAR     : int  2 2 2 2 2 2 2 2 2 1 ...
##  $ DEYE     : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ DOUT     : int  1 2 1 2 NA 2 2 1 1 1 ...
##  $ DPHY     : int  2 2 1 2 2 1 2 1 1 1 ...
##  $ DRAT     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ DRATX    : int  NA 2 NA NA NA NA NA NA NA NA ...
##  $ DREM     : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ ENG      : int  NA 1 NA NA NA 1 NA NA NA NA ...
##  $ FER      : int  NA NA NA 2 NA NA NA NA NA NA ...
##  $ GCL      : int  2 NA 2 NA NA 2 NA 2 2 2 ...
##  $ GCM      : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ GCR      : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ HIMRKS   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HINS1    : int  2 2 2 1 2 2 2 2 2 2 ...
##  $ HINS2    : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ HINS3    : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ HINS4    : int  1 2 1 2 1 1 1 1 1 1 ...
##  $ HINS5    : int  2 1 2 2 2 2 2 2 2 2 ...
##  $ HINS6    : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ HINS7    : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ INTP     : int  0 0 0 0 NA 0 0 0 0 0 ...
##  $ JWMNP    : int  NA 5 NA 5 NA NA NA NA NA NA ...
##  $ JWRIP    : int  NA 3 NA NA NA NA NA NA NA NA ...
##  $ JWTRNS   : int  NA 1 NA 10 NA NA NA NA NA NA ...
##  $ LANX     : int  2 1 2 2 2 1 2 2 2 2 ...
##  $ MAR      : int  5 5 1 5 5 5 5 3 5 5 ...
##  $ MARHD    : int  NA NA 2 NA NA NA NA 2 NA NA ...
##  $ MARHM    : int  NA NA 2 NA NA NA NA 2 NA NA ...
##  $ MARHT    : int  NA NA 2 NA NA NA NA 1 NA NA ...
##  $ MARHW    : int  NA NA 2 NA NA NA NA 2 NA NA ...
##  $ MARHYP   : int  NA NA 1994 NA NA NA NA 1994 NA NA ...
##  $ MIG      : int  1 3 1 3 3 1 1 1 1 1 ...
##  $ MIL      : int  4 1 4 4 NA 4 4 4 4 4 ...
##  $ MLPA     : int  NA 1 NA NA NA NA NA NA NA NA ...
##  $ MLPB     : int  NA 0 NA NA NA NA NA NA NA NA ...
##  $ MLPCD    : int  NA 0 NA NA NA NA NA NA NA NA ...
##  $ MLPE     : int  NA 0 NA NA NA NA NA NA NA NA ...
##  $ MLPFG    : int  NA 0 NA NA NA NA NA NA NA NA ...
##  $ MLPH     : int  NA 0 NA NA NA NA NA NA NA NA ...
##  $ MLPIK    : int  NA 0 NA NA NA NA NA NA NA NA ...
##  $ MLPJ     : int  NA 0 NA NA NA NA NA NA NA NA ...
##  $ NWAB     : int  2 3 3 3 NA 2 3 3 3 2 ...
##  $ NWAV     : int  5 5 5 5 NA 5 5 5 5 5 ...
##  $ NWLA     : int  2 3 3 3 NA 2 3 3 3 2 ...
##  $ NWLK     : int  2 3 3 3 NA 2 3 3 3 2 ...
##  $ NWRE     : int  3 3 3 3 NA 3 3 3 3 3 ...
##  $ OIP      : int  0 0 0 0 NA 0 0 0 0 0 ...
##  $ PAP      : int  0 0 0 0 NA 0 0 0 0 0 ...
##  $ RELSHIPP : int  38 38 37 38 37 38 38 37 37 38 ...
##  $ RETP     : int  0 0 0 0 NA 0 0 0 0 0 ...
##  $ SCH      : int  1 1 1 2 2 1 2 1 1 1 ...
##  $ SCHG     : int  NA NA NA 15 10 NA 15 NA NA NA ...
##  $ SCHL     : int  16 16 16 18 10 19 19 21 19 17 ...
##  $ SEMP     : int  0 0 0 0 NA 0 0 0 0 0 ...
##  $ SEX      : int  1 1 1 2 1 2 1 2 1 2 ...
##  $ SSIP     : int  2200 0 0 0 NA 0 0 0 0 0 ...
##  $ SSP      : int  0 0 4300 0 NA 10100 0 19200 2700 6600 ...
##  $ WAGP     : int  0 24000 0 10000 NA 0 5400 0 0 0 ...
##  $ WKHP     : int  NA 40 NA 15 NA NA 34 NA NA NA ...
##  $ WKL      : int  3 1 3 1 NA 3 1 3 3 3 ...
##  $ WKWN     : int  NA 52 NA 52 NA NA 23 NA NA NA ...
##  $ WRK      : int  2 1 NA 1 NA 2 NA NA NA 2 ...
##  $ YOEP     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ ANC      : int  4 1 1 4 4 1 1 2 4 4 ...
##  $ ANC1P    : int  999 210 902 999 999 902 902 599 999 999 ...
##  $ ANC2P    : int  999 999 999 999 999 999 999 920 999 999 ...
##  $ DECADE   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ DIS      : int  1 2 1 2 2 1 2 1 1 1 ...
##  $ DRIVESP  : int  NA 3 NA NA NA NA NA NA NA NA ...
##  $ ESP      : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ ESR      : int  6 4 6 1 NA 6 2 6 6 6 ...
##  $ FOD1P    : int  NA NA NA NA NA NA NA 5003 NA NA ...
##  $ FOD2P    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ HICOV    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ HISP     : int  1 2 1 1 1 1 1 1 1 1 ...
##  $ INDP     : int  NA 9770 NA 7870 NA NA 4971 NA NA NA ...
##  $ JWAP     : int  NA 51 NA 189 NA NA NA NA NA NA ...
##  $ JWDP     : int  NA 15 NA 114 NA NA NA NA NA NA ...
##  $ LANP     : int  NA 1200 NA NA NA 1200 NA NA NA NA ...
##  $ MIGPUMA  : int  NA 4200 NA 1200 1490 NA NA NA NA NA ...
##  $ MIGSP    : int  NA 36 NA 24 45 NA NA NA NA NA ...
##  $ MSP      : int  6 6 2 6 NA 6 6 4 6 6 ...
##  $ NAICSP   : chr  "" "92811P4" "" "611M1" ...
##  $ NATIVITY : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ NOP      : int  NA NA NA NA NA NA NA NA NA NA ...
##   [list output truncated]

keep <- c( "AGEP", "COW", "ESR", "ESP", "DRIVESP", "MAR", "OCCP", "RELSHIPP", "RAC1P", "SEX", "FWKHP", "WKHP", "PINCP")

psam_p45_small <- census <- census %>% select(all_of(keep))

write.csv(psam_p45_small, file = "psam_p45_small.csv")

dim(census)

## [1] 56198    13

census <- census %>% 
rename(age = AGEP, workclass= COW, emp=ESR, employ_status_parent =ESP, vehicle_no =DRIVESP, marital_status = MAR, occupation = OCCP, relationship = RELSHIPP, race= RAC1P, sex= SEX, usual_hour_worked = FWKHP,  hours_work=WKHP, total_income = PINCP)

head(census)

##   age workclass emp employ_status_parent vehicle_no marital_status occupation
## 1  54        NA   6                   NA         NA              5         NA
## 2  27         5   4                   NA          3              5       8600
## 3  72        NA   6                   NA         NA              1         NA
## 4  19         1   1                   NA         NA              5       4420
## 5  13        NA  NA                   NA         NA              5         NA
## 6  57        NA   6                   NA         NA              5         NA
##   relationship race sex usual_hour_worked hours_work total_income
## 1           38    1   1                 0         NA         2200
## 2           38    8   1                 0         40        24000
## 3           37    2   1                 0         NA         4300
## 4           38    1   2                 0         15        10000
## 5           37    1   1                 0         NA           NA
## 6           38    2   2                 0         NA        10100

#dim(census)
#str(census)

## delete row if income is NA
inc_na <-  is.na(census$total_income) 
census <- census[!inc_na, ]
dim(census)

## [1] 47965    13

## change income values to categorical 
census$total_income <- ifelse(census$total_income >= 50000, ">=50K", "< 50K")
census$total_income <- as.factor(census$total_income) 
#table(census$total_income)
#View(census)

census$sex =factor(census$sex,levels=c(1,2),labels= c("Male","Female"))
str(census$sex)

##  Factor w/ 2 levels "Male","Female": 1 1 1 2 2 1 2 1 2 2 ...

table(census$sex)

## 
##   Male Female 
##  22914  25051

census$race <- factor(census$race,levels = c(1,2,3,4,5,6,7,8,9),labels=c("White","Black or African American","American Indian","Alaska Native", "American Indian and Alaska Native","Asian", "Native Hawaiian and Other Pacific Islander", "Some Other Race alone", "Two or More Races" ))

table(census$race)

## 
##                                      White 
##                                      33989 
##                  Black or African American 
##                                       9169 
##                            American Indian 
##                                        255 
##                              Alaska Native 
##                                          0 
##          American Indian and Alaska Native 
##                                         63 
##                                      Asian 
##                                        757 
## Native Hawaiian and Other Pacific Islander 
##                                         42 
##                      Some Other Race alone 
##                                        998 
##                          Two or More Races 
##                                       2692

census$workclass <- factor(census$workclass,levels = c(0,1,2,3,4,5,6,7,8,9),labels=c("< 16","Private for profit","Private non profit","Local Government","State Government", "Federal Government","Self Employed", "Self Employed Corporation", "Family Business/Farm", "Unemployed" ))

table(census$workclass)

## 
##                      < 16        Private for profit        Private non profit 
##                         0                     21195                      2391 
##          Local Government          State Government        Federal Government 
##                      2370                      1748                      1214 
##             Self Employed Self Employed Corporation      Family Business/Farm 
##                      2122                      1438                       109 
##                Unemployed 
##                       169

census$emp <- factor(census$emp, levels = c(0,1,2,3,4,5,6),labels=c("<16","Civilian employed","Civilian employed/not at work","Unemployed","Armed Forces/at work", "Armed Forces/not at work","Not in workforce" ))

table(census$emp)

## 
##                           <16             Civilian employed 
##                             0                         24172 
## Civilian employed/not at work                    Unemployed 
##                           472                          1007 
##          Armed Forces/at work      Armed Forces/not at work 
##                           393                             3 
##              Not in workforce 
##                         21257

census$employ_status_parent <- factor(census$employ_status_parent, levels=c(0,1,2,3,4,5,6,7,8),labels=c("NA","2 Parent both Work","2 Parent Father work"," 2parent Mother work","2 Parent Neither work", "Father Father work","Father Father Unemployed", "Mother Mother works", "Mother Mother unemployed"))

table(census$employ_status_parent)

## 
##                       NA       2 Parent both Work     2 Parent Father work 
##                        0                      811                      257 
##      2parent Mother work    2 Parent Neither work       Father Father work 
##                       63                       20                       99 
## Father Father Unemployed      Mother Mother works Mother Mother unemployed 
##                       26                      373                       67

census$vehicle_no <- factor(census$vehicle_no,levels = c(0,1,2,3,4,5,6),labels=c("NA","Drive Alone","2 Person Carpool","3 Person Carpool","4 Person Carpool", "5 6 Person Carpool","7 or more Carpool"))

table(census$vehicle_no)

## 
##                 NA        Drive Alone   2 Person Carpool   3 Person Carpool 
##                  0              18621               1572                303 
##   4 Person Carpool 5 6 Person Carpool  7 or more Carpool 
##                120                 63                 29

census$marital_status <- factor(census$marital_status, levels = c(1,2,3,4,5),labels=c("Married","Widowed","Divorced","Separated", "Never Married under 15" ))

table(census$marital_status)

## 
##                Married                Widowed               Divorced 
##                  24871                   3644                   4910 
##              Separated Never Married under 15 
##                    925                  13615

census$relationship <- factor( census$relationship,
 levels = c(20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38),
 labels = c("Reference person", "husband/wife", "unmarried partner", "Same-sex husband/wife",
 "Same-sex unmarried partner", "son or daughter", "Adopted son or daughter", "Stepson or stepdaughter",
 "Brother or sister", "Father or mother", "Grandchild", "Parent-in-law", "Son-in-law or daughter-in-law",
 "Other relative", "Roommate or housemate", "Foster child", "Other nonrelative", "Institutionalized group quarters population", "Noninstitutionalized group quarters population"))

census$occupation <- factor(census$occupation)
str(census)

## 'data.frame':    47965 obs. of  13 variables:
##  $ age                 : int  54 27 72 19 57 21 72 40 78 20 ...
##  $ workclass           : Factor w/ 10 levels "< 16","Private for profit",..: NA 6 NA 2 NA 3 NA NA NA 2 ...
##  $ emp                 : Factor w/ 7 levels "<16","Civilian employed",..: 7 5 7 2 7 3 7 7 7 7 ...
##  $ employ_status_parent: Factor w/ 9 levels "NA","2 Parent both Work",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ vehicle_no          : Factor w/ 7 levels "NA","Drive Alone",..: NA 4 NA NA NA NA NA NA NA NA ...
##  $ marital_status      : Factor w/ 5 levels "Married","Widowed",..: 5 5 1 5 5 5 3 5 5 5 ...
##  $ occupation          : Factor w/ 524 levels "10","20","40",..: NA 468 NA 277 NA 324 NA NA NA 295 ...
##  $ relationship        : Factor w/ 19 levels "Reference person",..: 19 19 18 19 19 19 18 18 19 19 ...
##  $ race                : Factor w/ 9 levels "White","Black or African American",..: 1 8 2 1 2 2 9 1 1 2 ...
##  $ sex                 : Factor w/ 2 levels "Male","Female": 1 1 1 2 2 1 2 1 2 2 ...
##  $ usual_hour_worked   : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ hours_work          : int  NA 40 NA 15 NA 34 NA NA NA 30 ...
##  $ total_income        : Factor w/ 2 levels "< 50K",">=50K": 1 1 1 1 1 1 1 1 1 1 ...

summary(census)

##       age                     workclass    
##  Min.   :15.00   Private for profit:21195  
##  1st Qu.:33.00   Private non profit: 2391  
##  Median :52.00   Local Government  : 2370  
##  Mean   :50.44   Self Employed     : 2122  
##  3rd Qu.:67.00   State Government  : 1748  
##  Max.   :93.00   (Other)           : 2930  
##                  NA's              :15209  
##                             emp                      employ_status_parent
##  Civilian employed            :24172   2 Parent both Work      :  811    
##  Not in workforce             :21257   Mother Mother works     :  373    
##  Unemployed                   : 1007   2 Parent Father work    :  257    
##  Civilian employed/not at work:  472   Father Father work      :   99    
##  Armed Forces/at work         :  393   Mother Mother unemployed:   67    
##  (Other)                      :    3   (Other)                 :  109    
##  NA's                         :  661   NA's                    :46249    
##               vehicle_no                   marital_status    occupation   
##  Drive Alone       :18621   Married               :24871   440    :  875  
##  2 Person Carpool  : 1572   Widowed               : 3644   4720   :  799  
##  3 Person Carpool  :  303   Divorced              : 4910   2310   :  714  
##  4 Person Carpool  :  120   Separated             :  925   3255   :  701  
##  5 6 Person Carpool:   63   Never Married under 15:13615   9130   :  690  
##  (Other)           :   29                                  (Other):28977  
##  NA's              :27257                                  NA's   :15209  
##                                          relationship  
##  Reference person                              :22891  
##  husband/wife                                  :11614  
##  son or daughter                               : 5187  
##  Noninstitutionalized group quarters population: 1824  
##  Institutionalized group quarters population   : 1362  
##  unmarried partner                             : 1121  
##  (Other)                                       : 3966  
##                         race           sex        usual_hour_worked
##  White                    :33989   Male  :22914   Min.   :0.0000   
##  Black or African American: 9169   Female:25051   1st Qu.:0.0000   
##  Two or More Races        : 2692                  Median :0.0000   
##  Some Other Race alone    :  998                  Mean   :0.1021   
##  Asian                    :  757                  3rd Qu.:0.0000   
##  American Indian          :  255                  Max.   :1.0000   
##  (Other)                  :  105                                   
##    hours_work    total_income 
##  Min.   : 1.00   < 50K:33039  
##  1st Qu.:34.00   >=50K:14926  
##  Median :40.00                
##  Mean   :37.79                
##  3rd Qu.:40.00                
##  Max.   :99.00                
##  NA's   :20112

library(mdsr)
library(dplyr)
library(mosaic)

## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2

## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.

## 
## Attaching package: 'mosaic'

## The following object is masked from 'package:Matrix':
## 
##     mean

## The following object is masked from 'package:ggplot2':
## 
##     stat

## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally

## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var

## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum

set.seed(364)
n <- nrow(census)
test_idx <- sample.int(n, size = round(0.2 * n))
train <- census[-test_idx, ]
nrow(train)

## [1] 38372

test <- census[test_idx, ]
nrow(test)

## [1] 9593

tally(~total_income, data = train, format = "percent")

## total_income
##    < 50K    >=50K 
## 68.93568 31.06432

library(magrittr)
library(dplyr)
library(ggplot2)
library(rpart)

rpart.control(minsplit = 2, minbucket = 1, cp = 0.001)

## $minsplit
## [1] 2
## 
## $minbucket
## [1] 1
## 
## $cp
## [1] 0.001
## 
## $maxcompete
## [1] 4
## 
## $maxsurrogate
## [1] 5
## 
## $usesurrogate
## [1] 2
## 
## $surrogatestyle
## [1] 0
## 
## $maxdepth
## [1] 30
## 
## $xval
## [1] 10

rpart(total_income ~ usual_hour_worked, data = train, control = rpart.control())

## n= 38372 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 38372 11920 < 50K (0.6893568 0.3106432) *

dat_train <- as.formula("total_income ~ age + workclass + emp + employ_status_parent + vehicle_no + race + sex + usual_hour_worked + hours_work ")

 model_tree <- rpart(dat_train, data = train)
 model_tree

## n= 38372 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 38372 11920 < 50K (0.6893568 0.3106432)  
##    2) emp=Civilian employed/not at work,Unemployed,Armed Forces/not at work,Not in workforce 18208  2621 < 50K (0.8560523 0.1439477) *
##    3) emp=Civilian employed,Armed Forces/at work 20164  9299 < 50K (0.5388316 0.4611684)  
##      6) age< 24.5 3072   173 < 50K (0.9436849 0.0563151) *
##      7) age>=24.5 17092  7966 >=50K (0.4660660 0.5339340)  
##       14) hours_work< 39.5 4028  1099 < 50K (0.7271599 0.2728401) *
##       15) hours_work>=39.5 13064  5037 >=50K (0.3855634 0.6144366)  
##         30) race=Black or African American,American Indian,American Indian and Alaska Native,Some Other Race alone 2485  1024 < 50K (0.5879276 0.4120724) *
##         31) race=White,Asian,Native Hawaiian and Other Pacific Islander,Two or More Races 10579  3576 >=50K (0.3380282 0.6619718) *

 plot(model_tree)
 text(model_tree, use.n = TRUE, all= TRUE, cex = 0.5)

## Check Decision Tree Classifier (3)
 library(rpart.plot)
 rpart.plot(model_tree)

 library(partykit)

## Loading required package: grid

## Loading required package: libcoin

## Loading required package: mvtnorm

 plot(as.party(model_tree))

#complexity parameter (cp) table with relative error and cross-validation error
printcp(model_tree)

## 
## Classification tree:
## rpart(formula = dat_train, data = train)
## 
## Variables actually used in tree construction:
## [1] age        emp        hours_work race      
## 
## Root node error: 11920/38372 = 0.31064
## 
## n= 38372 
## 
##         CP nsplit rel error  xerror      xstd
## 1 0.083613      0   1.00000 1.00000 0.0076047
## 2 0.036661      3   0.74916 0.74916 0.0069443
## 3 0.010000      4   0.71250 0.71435 0.0068286

#Confusion matrix
 train <- train %>%
 mutate(income_tree = predict(model_tree, type = "class"))
 confusion <- tally(income_tree ~ total_income, data = train, format = "count")
 confusion

##            total_income
## income_tree < 50K >=50K
##       < 50K 22876  4917
##       >=50K  3576  7003

 accuracy <- sum(diag(confusion)) / nrow(train)
 accuracy

## [1] 0.7786667

#Prediction in test dataset
# Confusion matrix

 test <- test %>%
 mutate(income_tree_test = predict(model_tree, type = "class", newdata = test))
 confusion_test <- tally(income_tree_test ~ total_income, data = test, format = "count")
 confusion_test

##                 total_income
## income_tree_test < 50K >=50K
##            < 50K  5742  1235
##            >=50K   845  1771

accuracy_test <- sum(diag(confusion_test)) / nrow(test)
accuracy_test

## [1] 0.7831752

 # TUNING PARAMETERS
 model_tree2 <- rpart(dat_train, data = test, control = rpart.control(cp = 0.002))
 test <- test %>%
 mutate(income_tree2 = predict(model_tree2, type = "class"))
 confusion_test_tune <- tally(income_tree2 ~ total_income, data = test, format = "count")
 confusion_test_tune

##             total_income
## income_tree2 < 50K >=50K
##        < 50K  5650  1102
##        >=50K   937  1904

 accuracy_test_tune <- sum(diag(confusion_test_tune)) / nrow(test)
 accuracy_test_tune

## [1] 0.7874492

US CENSUS DATA

Maxine Harlemon

2025-11-18