Setup

knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(tidyverse)
library(tidytext)
library(caret)
library(fastDummies)
library(randomForest)
library(broom)
library(factoextra)
# https://www.openml.org/d/1590

raw_income = read_csv("/Users/rochellerafn/RStudio Files/openml_1590.csv", na=c("?"))

income = read_csv("/Users/rochellerafn/RStudio Files/openml_1590.csv", na=c("?")) %>%
  drop_na() %>%
  mutate(income_above_50k = class==">50K") %>%
  select(-class) %>%
  dummy_cols(remove_selected_columns = T) 

head(income)

Some questions about the data

Please try at least a few of the following:

Please be prepared to

You can mostly ignore this… I just changed the name of some of the columns just to make it easier for me to search and and plot in ggplot to get familiar with the data set. These are just some random variables that I chose to look at initially.
income_2 <- income

names(income_2)[3]<- "education_num"
names(income_2)[4]<- "capital_gain"
names(income_2)[5]<- "capital_loss"
names(income_2)[6]<- "hours_per_week"
names(income_2)[8]<- "workclass_fed_gov"
names(income_2)[9]<- "workclass_loc_gov"
names(income_2)[11]<- "workclass_self_emp_inc"
names(income_2)[12]<- "workclass_self_emp_not_inc"
names(income_2)[13]<- "workclass_state_gov"
names(income_2)[14]<- "workclass_without_pay"


head(income_2)
Scaling the numeric variables to make sure we’re able to compare them against eachother on an even scale.
income_2 <- income_2 %>%
  mutate(ages = scale(age, center = TRUE, scale = TRUE)) %>%
  mutate(education_nums = scale(education_num, center = TRUE, scale = TRUE)) %>%
  mutate(capital_gains = scale(capital_gain, center = TRUE, scale = TRUE)) %>%
  mutate(capital_losss = scale(capital_loss, center = TRUE, scale = TRUE)) %>%
  mutate(hours_per_weeks = scale(hours_per_week, center = TRUE, scale = TRUE)) %>%
  relocate("ages", "education_nums", "capital_gains", "capital_losss", "hours_per_weeks") %>%
  select(-age, -fnlwgt, -education_num, -capital_gain, -capital_loss, -hours_per_week)

head(income_2)

K-Means Cluster Analysis

Taking a look at the clusters to determine how many clusters to analyze. After doing some research we felt that we got the clearest clusters between age and education level. As we move on further in this process you’ll see that age and education level continue to be meaningful factors. Every time I ran this the clusters would change… but generally you could see the clearest clusters from 6-8.
kclusts <- tibble(k = 1:9) %>%
  mutate(
    kclust = map(k, ~kmeans(income_2, .x)),
    glanced = map(kclust, glance),
    augmented = map(kclust, augment, income_2)
  )

clusterings <- kclusts %>% 
  unnest(glanced, .drop = TRUE)

ggplot(clusterings, aes(k, tot.withinss)) +
  geom_line()

## scatter plots
assignments <- kclusts %>% 
  unnest(augmented)

ggplot(assignments, aes(ages, education_nums)) +
  geom_point(aes(color = .cluster), alpha=0.3) +
  theme(strip.background = element_blank(),
        panel.background = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(),
        legend.key = element_blank())+
  labs(colour = "Clusters")+
  xlab("Age")+
  ylab("Education Level")+
  facet_wrap(~ k)

K-Means Cluster
set.seed(504)
kclust <- kmeans(income_2, centers = 7)
kclust$centers
##         ages education_nums capital_gains capital_losss hours_per_weeks
## 1  0.3772586     1.32245583    0.09976050    -0.2184995       0.4204515
## 2 -1.2143231    -0.30710965   -0.13424950    -0.2137009      -1.6856221
## 3  0.2282063    -0.27496499   -0.06157595    -0.2187778       0.4010987
## 4  0.5333617    -2.17297916   -0.11546797    -0.2132712      -0.1115023
## 5 -0.8163350     0.03768491   -0.12061148    -0.2176108       0.1248304
## 6  0.2835291     0.44501779    1.17162940     4.0119987       0.2976181
## 7  0.7919683    -0.11755477   -0.09423801    -0.2115712      -0.3455048
##   income_above_50k workclass_fed_gov workclass_loc_gov workclass_Private
## 1       0.64189013       0.049726634        0.12106222         0.5673002
## 2       0.01309035       0.010010267        0.03388090         0.8603696
## 3       0.35044044       0.031692838        0.05697051         0.7057641
## 4       0.06099152       0.008153947        0.04403131         0.8114808
## 5       0.03054449       0.021627775        0.05036995         0.8441472
## 6       0.57260156       0.035868626        0.07649092         0.6577355
## 7       0.10057708       0.043693322        0.08244023         0.7305578
##   workclass_self_emp_inc workclass_self_emp_not_inc workclass_state_gov
## 1            0.075110648                 0.11090862          0.07563135
## 2            0.009753593                 0.03439425          0.05082136
## 3            0.049023363                 0.12619686          0.02996936
## 4            0.016634051                 0.10404436          0.01500326
## 5            0.012236767                 0.03708974          0.03424398
## 6            0.082541054                 0.10890233          0.03802939
## 7            0.020335257                 0.07282220          0.04932674
##   workclass_without_pay education_1st-4th education_5th-6th education_7th-8th
## 1          0.0002603489      0.0000000000       0.000000000      0.000000e+00
## 2          0.0007700205      0.0002566735       0.001283368      3.080082e-03
## 3          0.0003829950      0.0000000000       0.000000000      2.872463e-04
## 4          0.0006523157      0.0701239400       0.139595564      2.557078e-01
## 5          0.0002845760      0.0000000000       0.000000000      9.485866e-05
## 6          0.0004321521      0.0025929127       0.006914434      9.939499e-03
## 7          0.0008244023      0.0000000000       0.000000000      0.000000e+00
##   education_9th education_10th education_11th education_12th
## 1  0.0000000000    0.000000000     0.00000000    0.000000000
## 2  0.0141170431    0.057238193     0.13321355    0.044147844
## 3  0.0004787438    0.004117196     0.02862888    0.014266565
## 4  0.1898238748    0.237769080     0.08251794    0.001630789
## 5  0.0017074559    0.012805919     0.03063935    0.014039082
## 6  0.0060501296    0.015989628     0.01858254    0.004321521
## 7  0.0002748008    0.007694422     0.02500687    0.012778236
##   education_Assoc-acdm education_Assoc-voc education_Bachelors
## 1           0.05584483         0.005727675          0.56873210
## 2           0.02515400         0.015143737          0.07828542
## 3           0.01455381         0.074492532          0.00000000
## 4           0.00000000         0.000000000          0.00000000
## 5           0.04287611         0.055207740          0.17804971
## 6           0.03154710         0.032411409          0.23811582
## 7           0.04163232         0.057845562          0.06430338
##   education_Doctorate education_HS-grad education_Masters education_Preschool
## 1        0.0577974486         0.0000000        0.23535538        0.0000000000
## 2        0.0012833676         0.2109856        0.01129363        0.0002566735
## 3        0.0000000000         0.5520873        0.00000000        0.0000000000
## 4        0.0000000000         0.0000000        0.00000000        0.0228310502
## 5        0.0001897173         0.3852210        0.02361981        0.0000000000
## 6        0.0375972342         0.2476232        0.10414866        0.0004321521
## 7        0.0008244023         0.4892828        0.02363287        0.0000000000
##   education_Prof-school education_Some-college marital-status_Divorced
## 1           0.076542567              0.0000000              0.09945327
## 2           0.002310062              0.4019507              0.02977413
## 3           0.000000000              0.3110877              0.01034087
## 4           0.000000000              0.0000000              0.11676451
## 5           0.001897173              0.2536521              0.09343578
## 6           0.068280035              0.1754538              0.10371651
## 7           0.001374004              0.2753504              0.51195383
##   marital-status_Married-AF-spouse marital-status_Married-civ-spouse
## 1                     0.0005206977                        0.75878677
## 2                     0.0012833676                        0.07058522
## 3                     0.0006702413                        0.98113750
## 4                     0.0003261579                        0.55381605
## 5                     0.0007588693                        0.03367482
## 6                     0.0008643042                        0.65687122
## 7                     0.0006870019                        0.15539984
##   marital-status_Married-spouse-absent marital-status_Never-married
## 1                          0.010023431                  0.103749024
## 2                          0.008213552                  0.871406571
## 3                          0.001723478                  0.001819226
## 4                          0.031637312                  0.182322244
## 5                          0.012331626                  0.819863404
## 6                          0.007778738                  0.191011236
## 7                          0.024732069                  0.102088486
##   marital-status_Separated marital-status_Widowed occupation_Adm-clerical
## 1               0.01666233            0.010804478              0.04751367
## 2               0.01668378            0.002053388              0.15734086
## 3               0.00306396            0.001244734              0.05524703
## 4               0.05609915            0.059034573              0.02804958
## 5               0.03784861            0.002086891              0.17302220
## 6               0.01815039            0.021607606              0.08124460
## 7               0.07873042            0.126408354              0.25927453
##   occupation_Armed-Forces occupation_Craft-repair occupation_Exec-managerial
## 1            0.0002603489              0.04074460                 0.27636032
## 2            0.0002566735              0.03439425                 0.02541068
## 3            0.0002872463              0.27317120                 0.11959020
## 4            0.0000000000              0.20026093                 0.03163731
## 5            0.0006640106              0.12141909                 0.09618668
## 6            0.0004321521              0.11754538                 0.21261884
## 7            0.0000000000              0.07611981                 0.12503435
##   occupation_Farming-fishing occupation_Handlers-cleaners
## 1                 0.01523041                  0.005988024
## 2                 0.01899384                  0.079825462
## 3                 0.05141708                  0.043182689
## 4                 0.09784736                  0.092628832
## 5                 0.02731929                  0.067254790
## 6                 0.02117545                  0.023768366
## 7                 0.01580104                  0.026106073
##   occupation_Machine-op-inspct occupation_Other-service
## 1                  0.008461338               0.01562093
## 2                  0.022843943               0.31108830
## 3                  0.091248564               0.04595940
## 4                  0.167971298               0.18460535
## 5                  0.074464049               0.11686587
## 6                  0.042783060               0.04494382
## 7                  0.063753779               0.15031602
##   occupation_Priv-house-serv occupation_Prof-specialty
## 1               0.0002603489               0.403150221
## 2               0.0084702259               0.078028747
## 3               0.0003829950               0.038395251
## 4               0.0205479452               0.008806262
## 5               0.0037943464               0.097040410
## 6               0.0021607606               0.223422645
## 7               0.0116790327               0.087798846
##   occupation_Protective-serv occupation_Sales occupation_Tech-support
## 1                0.019265816       0.12796147             0.028638375
## 2                0.008983573       0.20944559             0.026180698
## 3                0.036384527       0.11145155             0.029107622
## 4                0.011089367       0.04761905             0.003587736
## 5                0.022196927       0.11715045             0.042212104
## 6                0.020311150       0.13396716             0.031979257
## 7                0.013465238       0.10359989             0.036273702
##   occupation_Transport-moving relationship_Husband relationship_Not-in-family
## 1                  0.01054413         0.6851080448                0.178989846
## 2                  0.01873717         0.0236139630                0.159650924
## 3                  0.10417465         0.9716583684                0.009574876
## 4                  0.10534899         0.4983692107                0.216894977
## 5                  0.04040979         0.0001897173                0.507209258
## 6                  0.04364736         0.5916162489                0.221261884
## 7                  0.03077769         0.0362737016                0.423330585
##   relationship_Other-relative relationship_Own-child relationship_Unmarried
## 1                 0.007550117             0.01431919            0.045561052
## 2                 0.054158111             0.66298768            0.061601643
## 3                 0.004595940             0.00382995            0.005744925
## 4                 0.064579256             0.05512068            0.125244618
## 5                 0.051128818             0.31274900            0.102542212
## 6                 0.015989628             0.06352636            0.046240277
## 7                 0.035449299             0.03847211            0.352569387
##   relationship_Wife race_Amer-Indian-Eskimo race_Asian-Pac-Islander race_Black
## 1        0.06847175             0.003514710              0.04699297 0.04113512
## 2        0.03798768             0.008726899              0.03182752 0.09496920
## 3        0.00459594             0.010053619              0.01895825 0.05591727
## 4        0.03979126             0.010763209              0.02315721 0.11904762
## 5        0.02618099             0.013280212              0.03054449 0.12919750
## 6        0.06136560             0.004321521              0.03197926 0.06136560
## 7        0.11390492             0.011816433              0.02102226 0.14962902
##    race_Other race_White  sex_Female  sex_Male native-country_Cambodia
## 1 0.004425931  0.9039313 0.155688623 0.8443114            0.0005206977
## 2 0.008983573  0.8554928 0.575718686 0.4242813            0.0000000000
## 3 0.005170433  0.9099004 0.005840674 0.9941593            0.0008617388
## 4 0.024461840  0.8225701 0.231898239 0.7681018            0.0009784736
## 5 0.010339594  0.8166382 0.420508442 0.5794916            0.0004742933
## 6 0.006050130  0.8962835 0.215211755 0.7847882            0.0008643042
## 7 0.004396812  0.8131355 0.762984336 0.2370157            0.0004122012
##   native-country_Canada native-country_China native-country_Columbia
## 1           0.005597501          0.005857850             0.001171570
## 2           0.003336756          0.002053388             0.002053388
## 3           0.002680965          0.001244734             0.001436231
## 4           0.003913894          0.003913894             0.003913894
## 5           0.002371467          0.001138304             0.001897173
## 6           0.005617978          0.005185825             0.001296456
## 7           0.003984611          0.001511404             0.002061006
##   native-country_Cuba native-country_Dominican-Republic native-country_Ecuador
## 1         0.003254361                      0.0007810466           0.0007810466
## 2         0.000513347                      0.0005133470           0.0010266940
## 3         0.002489468                      0.0013404826           0.0008617388
## 4         0.009132420                      0.0130463144           0.0022831050
## 5         0.001517739                      0.0021817492           0.0012331626
## 6         0.002160761                      0.0012964564           0.0000000000
## 7         0.004259412                      0.0012366035           0.0005496015
##   native-country_El-Salvador native-country_England native-country_France
## 1               0.0013017443            0.005467326          0.0022129654
## 2               0.0041067762            0.002566735          0.0005133470
## 3               0.0016277288            0.001244734          0.0001914975
## 4               0.0215264188            0.001304631          0.0003261579
## 5               0.0024663252            0.001992032          0.0006640106
## 6               0.0008643042            0.003025065          0.0000000000
## 7               0.0013740038            0.003022808          0.0009618027
##   native-country_Germany native-country_Greece native-country_Guatemala
## 1            0.006118198          0.0010413955             0.0000000000
## 2            0.003080082          0.0010266940             0.0015400411
## 3            0.004117196          0.0017234776             0.0009574876
## 4            0.001630789          0.0019569472             0.0143509459
## 5            0.004173781          0.0004742933             0.0018971732
## 6            0.003025065          0.0017286085             0.0004321521
## 7            0.004809013          0.0005496015             0.0006870019
##   native-country_Haiti native-country_Holand-Netherlands
## 1         0.0005206977                      0.0000000000
## 2         0.0015400411                      0.0000000000
## 3         0.0010532363                      0.0000000000
## 4         0.0048923679                      0.0000000000
## 5         0.0015177386                      0.0000000000
## 6         0.0012964564                      0.0004321521
## 7         0.0019236054                      0.0000000000
##   native-country_Honduras native-country_Hong native-country_Hungary
## 1            0.0001301744        0.0009112210           0.0006508722
## 2            0.0005133470        0.0000000000           0.0002566735
## 3            0.0000000000        0.0002872463           0.0004787438
## 4            0.0016307893        0.0013046314           0.0000000000
## 5            0.0002845760        0.0007588693           0.0001897173
## 6            0.0008643042        0.0017286085           0.0004321521
## 7            0.0008244023        0.0002748008           0.0005496015
##   native-country_India native-country_Iran native-country_Ireland
## 1         0.0098932570        0.0039052330           0.0005206977
## 2         0.0043634497        0.0007700205           0.0000000000
## 3         0.0010532363        0.0004787438           0.0007659900
## 4         0.0016307893        0.0000000000           0.0006523157
## 5         0.0018023146        0.0009485866           0.0015177386
## 6         0.0056179775        0.0017286085           0.0008643042
## 7         0.0008244023        0.0005496015           0.0005496015
##   native-country_Italy native-country_Jamaica native-country_Japan
## 1         0.0026034887           0.0011715699         0.0045561052
## 2         0.0012833676           0.0020533881         0.0012833676
## 3         0.0024894676           0.0018192264         0.0007659900
## 4         0.0088062622           0.0022831050         0.0006523157
## 5         0.0007588693           0.0036046291         0.0021817492
## 6         0.0012964564           0.0004321521         0.0017286085
## 7         0.0015114042           0.0028854081         0.0016488046
##   native-country_Laos native-country_Mexico native-country_Nicaragua
## 1        0.0002603489           0.004946628             0.0003905233
## 2        0.0000000000           0.010523614             0.0020533881
## 3        0.0005744925           0.011489851             0.0006702413
## 4        0.0016307893           0.163078930             0.0019569472
## 5        0.0004742933           0.014797951             0.0014228799
## 6        0.0004321521           0.007346586             0.0008643042
## 7        0.0002748008           0.004259412             0.0009618027
##   native-country_Outlying-US(Guam-USVI-etc) native-country_Peru
## 1                              0.0001301744        0.0007810466
## 2                              0.0000000000        0.0010266940
## 3                              0.0001914975        0.0006702413
## 4                              0.0006523157        0.0013046314
## 5                              0.0009485866        0.0014228799
## 6                              0.0000000000        0.0004321521
## 7                              0.0009618027        0.0010992031
##   native-country_Philippines native-country_Poland native-country_Portugal
## 1                0.009893257          0.0016922676            0.0002603489
## 2                0.004363450          0.0010266940            0.0005133470
## 3                0.004212945          0.0022022214            0.0017234776
## 4                0.007175473          0.0019569472            0.0091324201
## 5                0.006165813          0.0015177386            0.0007588693
## 6                0.006482282          0.0008643042            0.0000000000
## 7                0.006045617          0.0023358065            0.0005496015
##   native-country_Puerto-Rico native-country_Scotland native-country_South
## 1                0.001562093            0.0003905233         0.0033845353
## 2                0.002823409            0.0002566735         0.0030800821
## 3                0.002680965            0.0003829950         0.0019149751
## 4                0.014024788            0.0003261579         0.0003261579
## 5                0.003509770            0.0004742933         0.0020868905
## 6                0.003457217            0.0000000000         0.0025929127
## 7                0.004946414            0.0008244023         0.0019236054
##   native-country_Taiwan native-country_Thailand native-country_Trinadad&Tobago
## 1          0.0035147097            0.0010413955                   0.0003905233
## 2          0.0007700205            0.0005133470                   0.0002566735
## 3          0.0003829950            0.0006702413                   0.0004787438
## 4          0.0000000000            0.0003261579                   0.0013046314
## 5          0.0014228799            0.0004742933                   0.0005691520
## 6          0.0017286085            0.0004321521                   0.0008643042
## 7          0.0002748008            0.0006870019                   0.0006870019
##   native-country_United-States native-country_Vietnam native-country_Yugoslavia
## 1                    0.9107003            0.001041395              0.0006508722
## 2                    0.9366016            0.001796715              0.0000000000
## 3                    0.9395825            0.001627729              0.0005744925
## 4                    0.6924331            0.002935421              0.0013046314
## 5                    0.9245874            0.002845760              0.0004742933
## 6                    0.9308557            0.001728608              0.0000000000
## 7                    0.9356966            0.001099203              0.0004122012
glance(kclust)
I also found this library with an interet search as another method to graphing clusters. So, I decided to give it a try.
fviz_cluster(kclust, income_2, ellipse.type = "norm", repel = TRUE, ggtheme = theme_minimal())

inc <- augment(kclust,income_2)
head(inc)
Analyzing the clusters based on age and years of education. You can see some separations, but it’s still kind of a lot of noise.
inc %>%
  pivot_longer(c(education_nums, ages),names_to = "feature") %>%
  ggplot(aes(value, fill=.cluster))+
  geom_density(alpha=0.3)+
  theme(strip.background = element_blank(),
        panel.background = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title = element_blank(),
        plot.title = element_text(),
        plot.background = element_blank())+
  facet_wrap(~feature, labeller = labeller(feature = 
                                             c("education_nums" = "Years of Education",
                                             "ages" = "Age")))+
  labs(fill = "Clusters")

PCA

Now trying a PCA analysis to see if we can find anything that sticks out more clearly.
set.seed(504)
pr_income <- prcomp(x = select(income,-income_above_50k), scale = T, center = T)
summary(pr_income)
## Importance of components:
##                            PC1     PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.13511 1.76172 1.61592 1.53271 1.39315 1.33405 1.30566
## Proportion of Variance 0.04383 0.02984 0.02511 0.02259 0.01866 0.01711 0.01639
## Cumulative Proportion  0.04383 0.07368 0.09878 0.12137 0.14004 0.15715 0.17354
##                            PC8     PC9    PC10   PC11    PC12   PC13    PC14
## Standard deviation     1.25584 1.19933 1.18169 1.1494 1.13792 1.1170 1.10488
## Proportion of Variance 0.01516 0.01383 0.01343 0.0127 0.01245 0.0120 0.01174
## Cumulative Proportion  0.18870 0.20253 0.21596 0.2287 0.24112 0.2531 0.26485
##                           PC15    PC16   PC17    PC18    PC19    PC20    PC21
## Standard deviation     1.09901 1.09042 1.0841 1.08221 1.06747 1.06548 1.05522
## Proportion of Variance 0.01161 0.01143 0.0113 0.01126 0.01096 0.01092 0.01071
## Cumulative Proportion  0.27646 0.28790 0.2992 0.31046 0.32141 0.33233 0.34304
##                           PC22    PC23    PC24    PC25    PC26    PC27    PC28
## Standard deviation     1.05174 1.04434 1.04271 1.03857 1.03821 1.03696 1.03227
## Proportion of Variance 0.01064 0.01049 0.01045 0.01037 0.01036 0.01034 0.01025
## Cumulative Proportion  0.35367 0.36416 0.37462 0.38499 0.39535 0.40569 0.41594
##                           PC29    PC30   PC31    PC32    PC33    PC34    PC35
## Standard deviation     1.03078 1.02803 1.0251 1.02333 1.02189 1.01949 1.01715
## Proportion of Variance 0.01022 0.01016 0.0101 0.01007 0.01004 0.00999 0.00995
## Cumulative Proportion  0.42615 0.43631 0.4464 0.45649 0.46653 0.47652 0.48647
##                           PC36    PC37    PC38   PC39    PC40    PC41    PC42
## Standard deviation     1.01566 1.01413 1.01171 1.0098 1.00831 1.00479 1.00383
## Proportion of Variance 0.00992 0.00989 0.00984 0.0098 0.00978 0.00971 0.00969
## Cumulative Proportion  0.49639 0.50628 0.51612 0.5259 0.53570 0.54541 0.55510
##                           PC43    PC44    PC45    PC46    PC47    PC48    PC49
## Standard deviation     1.00217 1.00194 1.00182 1.00146 1.00120 1.00097 1.00075
## Proportion of Variance 0.00966 0.00965 0.00965 0.00964 0.00964 0.00963 0.00963
## Cumulative Proportion  0.56475 0.57441 0.58406 0.59370 0.60334 0.61297 0.62260
##                           PC50    PC51    PC52    PC53    PC54   PC55    PC56
## Standard deviation     1.00052 1.00018 1.00015 1.00005 0.99978 0.9993 0.99878
## Proportion of Variance 0.00963 0.00962 0.00962 0.00962 0.00961 0.0096 0.00959
## Cumulative Proportion  0.63223 0.64185 0.65147 0.66108 0.67069 0.6803 0.68989
##                           PC57    PC58    PC59    PC60    PC61    PC62    PC63
## Standard deviation     0.99798 0.99728 0.99662 0.99488 0.99467 0.99372 0.99318
## Proportion of Variance 0.00958 0.00956 0.00955 0.00952 0.00951 0.00949 0.00948
## Cumulative Proportion  0.69946 0.70903 0.71858 0.72809 0.73761 0.74710 0.75659
##                           PC64    PC65    PC66    PC67    PC68    PC69    PC70
## Standard deviation     0.99052 0.98995 0.98801 0.98608 0.98538 0.98314 0.98156
## Proportion of Variance 0.00943 0.00942 0.00939 0.00935 0.00934 0.00929 0.00926
## Cumulative Proportion  0.76602 0.77544 0.78483 0.79418 0.80352 0.81281 0.82207
##                           PC71    PC72    PC73    PC74    PC75    PC76    PC77
## Standard deviation     0.98094 0.97880 0.97661 0.96924 0.96591 0.95795 0.95564
## Proportion of Variance 0.00925 0.00921 0.00917 0.00903 0.00897 0.00882 0.00878
## Cumulative Proportion  0.83133 0.84054 0.84971 0.85874 0.86771 0.87654 0.88532
##                           PC78    PC79    PC80    PC81    PC82    PC83    PC84
## Standard deviation     0.95375 0.94992 0.93892 0.93181 0.92806 0.91281 0.89884
## Proportion of Variance 0.00875 0.00868 0.00848 0.00835 0.00828 0.00801 0.00777
## Cumulative Proportion  0.89407 0.90274 0.91122 0.91957 0.92785 0.93586 0.94363
##                           PC85    PC86    PC87    PC88    PC89    PC90    PC91
## Standard deviation     0.89134 0.85504 0.85248 0.81285 0.78871 0.77356 0.73836
## Proportion of Variance 0.00764 0.00703 0.00699 0.00635 0.00598 0.00575 0.00524
## Cumulative Proportion  0.95127 0.95830 0.96529 0.97164 0.97762 0.98337 0.98862
##                           PC92    PC93    PC94    PC95     PC96      PC97
## Standard deviation     0.69594 0.65642 0.49845 0.14222 1.69e-14 8.356e-15
## Proportion of Variance 0.00466 0.00414 0.00239 0.00019 0.00e+00 0.000e+00
## Cumulative Proportion  0.99327 0.99742 0.99981 1.00000 1.00e+00 1.000e+00
##                             PC98      PC99     PC100     PC101    PC102
## Standard deviation     6.635e-15 6.123e-15 5.397e-15 4.749e-15 4.24e-15
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.00e+00
## Cumulative Proportion  1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.00e+00
##                            PC103     PC104
## Standard deviation     3.738e-15 2.032e-15
## Proportion of Variance 0.000e+00 0.000e+00
## Cumulative Proportion  1.000e+00 1.000e+00
Based on the screeplot it looks like choosing 4 or 5 components is the most appropriate.
screeplot(pr_income, type = "lines")

Looking at the first 4 principal components
# find features w/ highest loadings per factor
rownames_to_column(as.data.frame(pr_income$rotation)) %>% 
  select(1:5) %>% 
  filter(abs(PC1) >= 0.25 | abs(PC2) >= 0.25 | abs(PC3) >= 0.25 | abs(PC4) >= 0.25)
Visualizing and naming the principal components
Well… looks like it switched back again lol… for some reason the PC1 and PC2 are switched back again… This is the PCA that was for some reason flip flopping measures of variables. So, the labels on these variables are only correct if the FALSE is the stronger plot to the right and TRUE is to the left on PC1 and PC2. Otherwise, PC1 actually = Married Men and PC2 is Low Education (basically the inverse of the current title).
prc <- bind_cols(select(income,income_above_50k),as.data.frame(pr_income$x)) %>% 
  select(1:5) %>% 
  rename("bachelorettes"=PC1, "high_education"=PC2, "white_usa"=PC3, "divorced_no_kids"=PC4)

prc %>% 
  select(income_above_50k, bachelorettes, high_education,
         white_usa, divorced_no_kids) %>% 
  pivot_longer(cols = -income_above_50k, names_to = "component",values_to = "loading") %>% 
  ggplot(aes(loading, fill= income_above_50k))+
  geom_density(alpha=0.4)+
  theme(strip.background = element_blank(),
        panel.background = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title = element_blank(),
        plot.title = element_text())+
  scale_fill_manual(values = c("#7052F5", "#50E6D7"),
                    name = "Income Above $50k")+
  facet_wrap(.~component, labeller = labeller(component =
                                                c("divorced_no_kids" = "Mature Female Divorcée",
                                                  "high_education" = "Highly Educated American Women",
                                                  "bachelorettes" = " Young Low Educated Bachelorettes",
                                                  "white_usa" = "Young White Americans")))

Analyzing the biplot of variables
biplot(pr_income, choices = c(1,2), 
       col = c("grey90", "#50E6D7"),
       xlim=c(-0.020, 0.020), ylim=c(-0.006, 0.006))

Choosing features based on the PCA and biplot
income_model_whtmen <- income_2 %>%
  select(income_above_50k, education_nums, education_Bachelors, education_Masters, education_Doctorate, `occupation_Prof-specialty`, `native-country_United-States`, sex_Male, `marital-status_Married-civ-spouse`, relationship_Husband, workclass_Private, race_White)

income_model_sglwomen <- income_2 %>%
  select(income_above_50k, ages, sex_Female, education_nums, education_Masters, education_Doctorate, `occupation_Prof-specialty`, `occupation_Exec-managerial`, `native-country_United-States`, `marital-status_Never-married`, `marital-status_Divorced`, `relationship_Not-in-family`)
Meh.
income_model_whtmen$income_above_50k <- as.factor(income_model_whtmen$income_above_50k)

ctrl <- trainControl(method = "cv", number = 3, classProbs=TRUE, summaryFunction = twoClassSummary)

set.seed(504) 
income_index <- createDataPartition(income_model_whtmen$income_above_50k, p = 0.80, list = FALSE)
train <- income_model_whtmen[ income_index, ]
test <- income_model_whtmen[-income_index, ]

# example spec for rf
fit <- train(make.names(income_above_50k) ~ .,
             data = train, 
             method = "rf",
             tuneLength = 3,
             metric = "ROC",
             trControl = ctrl)

fit
## Random Forest 
## 
## 36179 samples
##    11 predictor
##     2 classes: 'FALSE.', 'TRUE.' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 24119, 24119, 24120 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##    2    0.7838102  0.9478172  0.4142969
##    6    0.7449016  0.9359106  0.4528828
##   11    0.7510866  0.9323461  0.4574551
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
confusionMatrix(predict(fit, test),factor(make.names(test$income_above_50k)))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE. TRUE.
##     FALSE.   6432  1281
##     TRUE.     370   960
##                                           
##                Accuracy : 0.8174          
##                  95% CI : (0.8093, 0.8253)
##     No Information Rate : 0.7522          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.433           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9456          
##             Specificity : 0.4284          
##          Pos Pred Value : 0.8339          
##          Neg Pred Value : 0.7218          
##              Prevalence : 0.7522          
##          Detection Rate : 0.7113          
##    Detection Prevalence : 0.8529          
##       Balanced Accuracy : 0.6870          
##                                           
##        'Positive' Class : FALSE.          
## 
library(pROC)

myRoc <- roc(test$income_above_50k, predict(fit, test, type="prob")[,2])

plot(myRoc)

Double, meh.
auc(myRoc)
## Area under the curve: 0.7771
Now attempting the model based more closely on the PC1 and single women. A little better! But still… not the “great” kappa I would have liked…
income_model_sglwomen$income_above_50k <- as.factor(income_model_sglwomen$income_above_50k)

ctrl2 <- trainControl(method = "cv", number = 3, classProbs=TRUE, summaryFunction = twoClassSummary)

set.seed(504) 
income_index2 <- createDataPartition(income_model_sglwomen$income_above_50k, p = 0.80, list = FALSE)
train2 <- income_model_sglwomen[ income_index2, ]
test2 <- income_model_sglwomen[-income_index2, ]

# example spec for rf
fit2 <- train(make.names(income_above_50k) ~ .,
             data = train2, 
             method = "rf",
             tuneLength = 3,
             metric = "ROC",
             trControl = ctrl2)

fit2
## Random Forest 
## 
## 36179 samples
##    11 predictor
##     2 classes: 'FALSE.', 'TRUE.' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 24119, 24119, 24120 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##    2    0.7997318  0.9414966  0.4500948
##    6    0.8163877  0.9173524  0.5203524
##   11    0.8113665  0.9066219  0.4931415
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 6.
confusionMatrix(predict(fit2, test2),factor(make.names(test2$income_above_50k)))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE. TRUE.
##     FALSE.   6214  1051
##     TRUE.     588  1190
##                                           
##                Accuracy : 0.8188          
##                  95% CI : (0.8107, 0.8266)
##     No Information Rate : 0.7522          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4777          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9136          
##             Specificity : 0.5310          
##          Pos Pred Value : 0.8553          
##          Neg Pred Value : 0.6693          
##              Prevalence : 0.7522          
##          Detection Rate : 0.6872          
##    Detection Prevalence : 0.8034          
##       Balanced Accuracy : 0.7223          
##                                           
##        'Positive' Class : FALSE.          
## 
myRoc2 <- roc(test2$income_above_50k, predict(fit2, test2, type="prob")[,2])

plot(myRoc2)

Better…
auc(myRoc2)
## Area under the curve: 0.8069
Finding the best tune.
print(fit2)
## Random Forest 
## 
## 36179 samples
##    11 predictor
##     2 classes: 'FALSE.', 'TRUE.' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 24119, 24119, 24120 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##    2    0.7997318  0.9414966  0.4500948
##    6    0.8163877  0.9173524  0.5203524
##   11    0.8113665  0.9066219  0.4931415
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 6.
print(fit2$bestTune)
##   mtry
## 2    6

Final Model

Running the final model with the best tune.
set.seed(504)
fit_final <- train(make.names(income_above_50k) ~ .,
             data = train2, 
             method = "rf",
             trControl = ctrl2,
             tuneLength = fit2$bestTune) 

myRoc3 <- roc(test2$income_above_50k, predict(fit_final, test2, type="prob")[,2])

plot(myRoc3)

auc(myRoc3)
## Area under the curve: 0.8093
confusionMatrix(predict(fit_final, test2),factor(make.names(test2$income_above_50k)))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE. TRUE.
##     FALSE.   6164  1050
##     TRUE.     638  1191
##                                           
##                Accuracy : 0.8133          
##                  95% CI : (0.8051, 0.8213)
##     No Information Rate : 0.7522          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4664          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9062          
##             Specificity : 0.5315          
##          Pos Pred Value : 0.8544          
##          Neg Pred Value : 0.6512          
##              Prevalence : 0.7522          
##          Detection Rate : 0.6816          
##    Detection Prevalence : 0.7977          
##       Balanced Accuracy : 0.7188          
##                                           
##        'Positive' Class : FALSE.          
##