knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(tidyverse)
library(tidytext)
library(caret)
library(fastDummies)
library(randomForest)
library(broom)
library(factoextra)
# https://www.openml.org/d/1590
raw_income = read_csv("/Users/rochellerafn/RStudio Files/openml_1590.csv", na=c("?"))
income = read_csv("/Users/rochellerafn/RStudio Files/openml_1590.csv", na=c("?")) %>%
drop_na() %>%
mutate(income_above_50k = class==">50K") %>%
select(-class) %>%
dummy_cols(remove_selected_columns = T)
head(income)
Please try at least a few of the following:
Please be prepared to
income_2 <- income
names(income_2)[3]<- "education_num"
names(income_2)[4]<- "capital_gain"
names(income_2)[5]<- "capital_loss"
names(income_2)[6]<- "hours_per_week"
names(income_2)[8]<- "workclass_fed_gov"
names(income_2)[9]<- "workclass_loc_gov"
names(income_2)[11]<- "workclass_self_emp_inc"
names(income_2)[12]<- "workclass_self_emp_not_inc"
names(income_2)[13]<- "workclass_state_gov"
names(income_2)[14]<- "workclass_without_pay"
head(income_2)
income_2 <- income_2 %>%
mutate(ages = scale(age, center = TRUE, scale = TRUE)) %>%
mutate(education_nums = scale(education_num, center = TRUE, scale = TRUE)) %>%
mutate(capital_gains = scale(capital_gain, center = TRUE, scale = TRUE)) %>%
mutate(capital_losss = scale(capital_loss, center = TRUE, scale = TRUE)) %>%
mutate(hours_per_weeks = scale(hours_per_week, center = TRUE, scale = TRUE)) %>%
relocate("ages", "education_nums", "capital_gains", "capital_losss", "hours_per_weeks") %>%
select(-age, -fnlwgt, -education_num, -capital_gain, -capital_loss, -hours_per_week)
head(income_2)
kclusts <- tibble(k = 1:9) %>%
mutate(
kclust = map(k, ~kmeans(income_2, .x)),
glanced = map(kclust, glance),
augmented = map(kclust, augment, income_2)
)
clusterings <- kclusts %>%
unnest(glanced, .drop = TRUE)
ggplot(clusterings, aes(k, tot.withinss)) +
geom_line()
## scatter plots
assignments <- kclusts %>%
unnest(augmented)
ggplot(assignments, aes(ages, education_nums)) +
geom_point(aes(color = .cluster), alpha=0.3) +
theme(strip.background = element_blank(),
panel.background = element_blank(),
panel.grid.minor = element_blank(),
plot.title = element_text(),
legend.key = element_blank())+
labs(colour = "Clusters")+
xlab("Age")+
ylab("Education Level")+
facet_wrap(~ k)
set.seed(504)
kclust <- kmeans(income_2, centers = 7)
kclust$centers
## ages education_nums capital_gains capital_losss hours_per_weeks
## 1 0.3772586 1.32245583 0.09976050 -0.2184995 0.4204515
## 2 -1.2143231 -0.30710965 -0.13424950 -0.2137009 -1.6856221
## 3 0.2282063 -0.27496499 -0.06157595 -0.2187778 0.4010987
## 4 0.5333617 -2.17297916 -0.11546797 -0.2132712 -0.1115023
## 5 -0.8163350 0.03768491 -0.12061148 -0.2176108 0.1248304
## 6 0.2835291 0.44501779 1.17162940 4.0119987 0.2976181
## 7 0.7919683 -0.11755477 -0.09423801 -0.2115712 -0.3455048
## income_above_50k workclass_fed_gov workclass_loc_gov workclass_Private
## 1 0.64189013 0.049726634 0.12106222 0.5673002
## 2 0.01309035 0.010010267 0.03388090 0.8603696
## 3 0.35044044 0.031692838 0.05697051 0.7057641
## 4 0.06099152 0.008153947 0.04403131 0.8114808
## 5 0.03054449 0.021627775 0.05036995 0.8441472
## 6 0.57260156 0.035868626 0.07649092 0.6577355
## 7 0.10057708 0.043693322 0.08244023 0.7305578
## workclass_self_emp_inc workclass_self_emp_not_inc workclass_state_gov
## 1 0.075110648 0.11090862 0.07563135
## 2 0.009753593 0.03439425 0.05082136
## 3 0.049023363 0.12619686 0.02996936
## 4 0.016634051 0.10404436 0.01500326
## 5 0.012236767 0.03708974 0.03424398
## 6 0.082541054 0.10890233 0.03802939
## 7 0.020335257 0.07282220 0.04932674
## workclass_without_pay education_1st-4th education_5th-6th education_7th-8th
## 1 0.0002603489 0.0000000000 0.000000000 0.000000e+00
## 2 0.0007700205 0.0002566735 0.001283368 3.080082e-03
## 3 0.0003829950 0.0000000000 0.000000000 2.872463e-04
## 4 0.0006523157 0.0701239400 0.139595564 2.557078e-01
## 5 0.0002845760 0.0000000000 0.000000000 9.485866e-05
## 6 0.0004321521 0.0025929127 0.006914434 9.939499e-03
## 7 0.0008244023 0.0000000000 0.000000000 0.000000e+00
## education_9th education_10th education_11th education_12th
## 1 0.0000000000 0.000000000 0.00000000 0.000000000
## 2 0.0141170431 0.057238193 0.13321355 0.044147844
## 3 0.0004787438 0.004117196 0.02862888 0.014266565
## 4 0.1898238748 0.237769080 0.08251794 0.001630789
## 5 0.0017074559 0.012805919 0.03063935 0.014039082
## 6 0.0060501296 0.015989628 0.01858254 0.004321521
## 7 0.0002748008 0.007694422 0.02500687 0.012778236
## education_Assoc-acdm education_Assoc-voc education_Bachelors
## 1 0.05584483 0.005727675 0.56873210
## 2 0.02515400 0.015143737 0.07828542
## 3 0.01455381 0.074492532 0.00000000
## 4 0.00000000 0.000000000 0.00000000
## 5 0.04287611 0.055207740 0.17804971
## 6 0.03154710 0.032411409 0.23811582
## 7 0.04163232 0.057845562 0.06430338
## education_Doctorate education_HS-grad education_Masters education_Preschool
## 1 0.0577974486 0.0000000 0.23535538 0.0000000000
## 2 0.0012833676 0.2109856 0.01129363 0.0002566735
## 3 0.0000000000 0.5520873 0.00000000 0.0000000000
## 4 0.0000000000 0.0000000 0.00000000 0.0228310502
## 5 0.0001897173 0.3852210 0.02361981 0.0000000000
## 6 0.0375972342 0.2476232 0.10414866 0.0004321521
## 7 0.0008244023 0.4892828 0.02363287 0.0000000000
## education_Prof-school education_Some-college marital-status_Divorced
## 1 0.076542567 0.0000000 0.09945327
## 2 0.002310062 0.4019507 0.02977413
## 3 0.000000000 0.3110877 0.01034087
## 4 0.000000000 0.0000000 0.11676451
## 5 0.001897173 0.2536521 0.09343578
## 6 0.068280035 0.1754538 0.10371651
## 7 0.001374004 0.2753504 0.51195383
## marital-status_Married-AF-spouse marital-status_Married-civ-spouse
## 1 0.0005206977 0.75878677
## 2 0.0012833676 0.07058522
## 3 0.0006702413 0.98113750
## 4 0.0003261579 0.55381605
## 5 0.0007588693 0.03367482
## 6 0.0008643042 0.65687122
## 7 0.0006870019 0.15539984
## marital-status_Married-spouse-absent marital-status_Never-married
## 1 0.010023431 0.103749024
## 2 0.008213552 0.871406571
## 3 0.001723478 0.001819226
## 4 0.031637312 0.182322244
## 5 0.012331626 0.819863404
## 6 0.007778738 0.191011236
## 7 0.024732069 0.102088486
## marital-status_Separated marital-status_Widowed occupation_Adm-clerical
## 1 0.01666233 0.010804478 0.04751367
## 2 0.01668378 0.002053388 0.15734086
## 3 0.00306396 0.001244734 0.05524703
## 4 0.05609915 0.059034573 0.02804958
## 5 0.03784861 0.002086891 0.17302220
## 6 0.01815039 0.021607606 0.08124460
## 7 0.07873042 0.126408354 0.25927453
## occupation_Armed-Forces occupation_Craft-repair occupation_Exec-managerial
## 1 0.0002603489 0.04074460 0.27636032
## 2 0.0002566735 0.03439425 0.02541068
## 3 0.0002872463 0.27317120 0.11959020
## 4 0.0000000000 0.20026093 0.03163731
## 5 0.0006640106 0.12141909 0.09618668
## 6 0.0004321521 0.11754538 0.21261884
## 7 0.0000000000 0.07611981 0.12503435
## occupation_Farming-fishing occupation_Handlers-cleaners
## 1 0.01523041 0.005988024
## 2 0.01899384 0.079825462
## 3 0.05141708 0.043182689
## 4 0.09784736 0.092628832
## 5 0.02731929 0.067254790
## 6 0.02117545 0.023768366
## 7 0.01580104 0.026106073
## occupation_Machine-op-inspct occupation_Other-service
## 1 0.008461338 0.01562093
## 2 0.022843943 0.31108830
## 3 0.091248564 0.04595940
## 4 0.167971298 0.18460535
## 5 0.074464049 0.11686587
## 6 0.042783060 0.04494382
## 7 0.063753779 0.15031602
## occupation_Priv-house-serv occupation_Prof-specialty
## 1 0.0002603489 0.403150221
## 2 0.0084702259 0.078028747
## 3 0.0003829950 0.038395251
## 4 0.0205479452 0.008806262
## 5 0.0037943464 0.097040410
## 6 0.0021607606 0.223422645
## 7 0.0116790327 0.087798846
## occupation_Protective-serv occupation_Sales occupation_Tech-support
## 1 0.019265816 0.12796147 0.028638375
## 2 0.008983573 0.20944559 0.026180698
## 3 0.036384527 0.11145155 0.029107622
## 4 0.011089367 0.04761905 0.003587736
## 5 0.022196927 0.11715045 0.042212104
## 6 0.020311150 0.13396716 0.031979257
## 7 0.013465238 0.10359989 0.036273702
## occupation_Transport-moving relationship_Husband relationship_Not-in-family
## 1 0.01054413 0.6851080448 0.178989846
## 2 0.01873717 0.0236139630 0.159650924
## 3 0.10417465 0.9716583684 0.009574876
## 4 0.10534899 0.4983692107 0.216894977
## 5 0.04040979 0.0001897173 0.507209258
## 6 0.04364736 0.5916162489 0.221261884
## 7 0.03077769 0.0362737016 0.423330585
## relationship_Other-relative relationship_Own-child relationship_Unmarried
## 1 0.007550117 0.01431919 0.045561052
## 2 0.054158111 0.66298768 0.061601643
## 3 0.004595940 0.00382995 0.005744925
## 4 0.064579256 0.05512068 0.125244618
## 5 0.051128818 0.31274900 0.102542212
## 6 0.015989628 0.06352636 0.046240277
## 7 0.035449299 0.03847211 0.352569387
## relationship_Wife race_Amer-Indian-Eskimo race_Asian-Pac-Islander race_Black
## 1 0.06847175 0.003514710 0.04699297 0.04113512
## 2 0.03798768 0.008726899 0.03182752 0.09496920
## 3 0.00459594 0.010053619 0.01895825 0.05591727
## 4 0.03979126 0.010763209 0.02315721 0.11904762
## 5 0.02618099 0.013280212 0.03054449 0.12919750
## 6 0.06136560 0.004321521 0.03197926 0.06136560
## 7 0.11390492 0.011816433 0.02102226 0.14962902
## race_Other race_White sex_Female sex_Male native-country_Cambodia
## 1 0.004425931 0.9039313 0.155688623 0.8443114 0.0005206977
## 2 0.008983573 0.8554928 0.575718686 0.4242813 0.0000000000
## 3 0.005170433 0.9099004 0.005840674 0.9941593 0.0008617388
## 4 0.024461840 0.8225701 0.231898239 0.7681018 0.0009784736
## 5 0.010339594 0.8166382 0.420508442 0.5794916 0.0004742933
## 6 0.006050130 0.8962835 0.215211755 0.7847882 0.0008643042
## 7 0.004396812 0.8131355 0.762984336 0.2370157 0.0004122012
## native-country_Canada native-country_China native-country_Columbia
## 1 0.005597501 0.005857850 0.001171570
## 2 0.003336756 0.002053388 0.002053388
## 3 0.002680965 0.001244734 0.001436231
## 4 0.003913894 0.003913894 0.003913894
## 5 0.002371467 0.001138304 0.001897173
## 6 0.005617978 0.005185825 0.001296456
## 7 0.003984611 0.001511404 0.002061006
## native-country_Cuba native-country_Dominican-Republic native-country_Ecuador
## 1 0.003254361 0.0007810466 0.0007810466
## 2 0.000513347 0.0005133470 0.0010266940
## 3 0.002489468 0.0013404826 0.0008617388
## 4 0.009132420 0.0130463144 0.0022831050
## 5 0.001517739 0.0021817492 0.0012331626
## 6 0.002160761 0.0012964564 0.0000000000
## 7 0.004259412 0.0012366035 0.0005496015
## native-country_El-Salvador native-country_England native-country_France
## 1 0.0013017443 0.005467326 0.0022129654
## 2 0.0041067762 0.002566735 0.0005133470
## 3 0.0016277288 0.001244734 0.0001914975
## 4 0.0215264188 0.001304631 0.0003261579
## 5 0.0024663252 0.001992032 0.0006640106
## 6 0.0008643042 0.003025065 0.0000000000
## 7 0.0013740038 0.003022808 0.0009618027
## native-country_Germany native-country_Greece native-country_Guatemala
## 1 0.006118198 0.0010413955 0.0000000000
## 2 0.003080082 0.0010266940 0.0015400411
## 3 0.004117196 0.0017234776 0.0009574876
## 4 0.001630789 0.0019569472 0.0143509459
## 5 0.004173781 0.0004742933 0.0018971732
## 6 0.003025065 0.0017286085 0.0004321521
## 7 0.004809013 0.0005496015 0.0006870019
## native-country_Haiti native-country_Holand-Netherlands
## 1 0.0005206977 0.0000000000
## 2 0.0015400411 0.0000000000
## 3 0.0010532363 0.0000000000
## 4 0.0048923679 0.0000000000
## 5 0.0015177386 0.0000000000
## 6 0.0012964564 0.0004321521
## 7 0.0019236054 0.0000000000
## native-country_Honduras native-country_Hong native-country_Hungary
## 1 0.0001301744 0.0009112210 0.0006508722
## 2 0.0005133470 0.0000000000 0.0002566735
## 3 0.0000000000 0.0002872463 0.0004787438
## 4 0.0016307893 0.0013046314 0.0000000000
## 5 0.0002845760 0.0007588693 0.0001897173
## 6 0.0008643042 0.0017286085 0.0004321521
## 7 0.0008244023 0.0002748008 0.0005496015
## native-country_India native-country_Iran native-country_Ireland
## 1 0.0098932570 0.0039052330 0.0005206977
## 2 0.0043634497 0.0007700205 0.0000000000
## 3 0.0010532363 0.0004787438 0.0007659900
## 4 0.0016307893 0.0000000000 0.0006523157
## 5 0.0018023146 0.0009485866 0.0015177386
## 6 0.0056179775 0.0017286085 0.0008643042
## 7 0.0008244023 0.0005496015 0.0005496015
## native-country_Italy native-country_Jamaica native-country_Japan
## 1 0.0026034887 0.0011715699 0.0045561052
## 2 0.0012833676 0.0020533881 0.0012833676
## 3 0.0024894676 0.0018192264 0.0007659900
## 4 0.0088062622 0.0022831050 0.0006523157
## 5 0.0007588693 0.0036046291 0.0021817492
## 6 0.0012964564 0.0004321521 0.0017286085
## 7 0.0015114042 0.0028854081 0.0016488046
## native-country_Laos native-country_Mexico native-country_Nicaragua
## 1 0.0002603489 0.004946628 0.0003905233
## 2 0.0000000000 0.010523614 0.0020533881
## 3 0.0005744925 0.011489851 0.0006702413
## 4 0.0016307893 0.163078930 0.0019569472
## 5 0.0004742933 0.014797951 0.0014228799
## 6 0.0004321521 0.007346586 0.0008643042
## 7 0.0002748008 0.004259412 0.0009618027
## native-country_Outlying-US(Guam-USVI-etc) native-country_Peru
## 1 0.0001301744 0.0007810466
## 2 0.0000000000 0.0010266940
## 3 0.0001914975 0.0006702413
## 4 0.0006523157 0.0013046314
## 5 0.0009485866 0.0014228799
## 6 0.0000000000 0.0004321521
## 7 0.0009618027 0.0010992031
## native-country_Philippines native-country_Poland native-country_Portugal
## 1 0.009893257 0.0016922676 0.0002603489
## 2 0.004363450 0.0010266940 0.0005133470
## 3 0.004212945 0.0022022214 0.0017234776
## 4 0.007175473 0.0019569472 0.0091324201
## 5 0.006165813 0.0015177386 0.0007588693
## 6 0.006482282 0.0008643042 0.0000000000
## 7 0.006045617 0.0023358065 0.0005496015
## native-country_Puerto-Rico native-country_Scotland native-country_South
## 1 0.001562093 0.0003905233 0.0033845353
## 2 0.002823409 0.0002566735 0.0030800821
## 3 0.002680965 0.0003829950 0.0019149751
## 4 0.014024788 0.0003261579 0.0003261579
## 5 0.003509770 0.0004742933 0.0020868905
## 6 0.003457217 0.0000000000 0.0025929127
## 7 0.004946414 0.0008244023 0.0019236054
## native-country_Taiwan native-country_Thailand native-country_Trinadad&Tobago
## 1 0.0035147097 0.0010413955 0.0003905233
## 2 0.0007700205 0.0005133470 0.0002566735
## 3 0.0003829950 0.0006702413 0.0004787438
## 4 0.0000000000 0.0003261579 0.0013046314
## 5 0.0014228799 0.0004742933 0.0005691520
## 6 0.0017286085 0.0004321521 0.0008643042
## 7 0.0002748008 0.0006870019 0.0006870019
## native-country_United-States native-country_Vietnam native-country_Yugoslavia
## 1 0.9107003 0.001041395 0.0006508722
## 2 0.9366016 0.001796715 0.0000000000
## 3 0.9395825 0.001627729 0.0005744925
## 4 0.6924331 0.002935421 0.0013046314
## 5 0.9245874 0.002845760 0.0004742933
## 6 0.9308557 0.001728608 0.0000000000
## 7 0.9356966 0.001099203 0.0004122012
glance(kclust)
fviz_cluster(kclust, income_2, ellipse.type = "norm", repel = TRUE, ggtheme = theme_minimal())
inc <- augment(kclust,income_2)
head(inc)
inc %>%
pivot_longer(c(education_nums, ages),names_to = "feature") %>%
ggplot(aes(value, fill=.cluster))+
geom_density(alpha=0.3)+
theme(strip.background = element_blank(),
panel.background = element_blank(),
panel.grid.minor = element_blank(),
axis.title = element_blank(),
plot.title = element_text(),
plot.background = element_blank())+
facet_wrap(~feature, labeller = labeller(feature =
c("education_nums" = "Years of Education",
"ages" = "Age")))+
labs(fill = "Clusters")
set.seed(504)
pr_income <- prcomp(x = select(income,-income_above_50k), scale = T, center = T)
summary(pr_income)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.13511 1.76172 1.61592 1.53271 1.39315 1.33405 1.30566
## Proportion of Variance 0.04383 0.02984 0.02511 0.02259 0.01866 0.01711 0.01639
## Cumulative Proportion 0.04383 0.07368 0.09878 0.12137 0.14004 0.15715 0.17354
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 1.25584 1.19933 1.18169 1.1494 1.13792 1.1170 1.10488
## Proportion of Variance 0.01516 0.01383 0.01343 0.0127 0.01245 0.0120 0.01174
## Cumulative Proportion 0.18870 0.20253 0.21596 0.2287 0.24112 0.2531 0.26485
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 1.09901 1.09042 1.0841 1.08221 1.06747 1.06548 1.05522
## Proportion of Variance 0.01161 0.01143 0.0113 0.01126 0.01096 0.01092 0.01071
## Cumulative Proportion 0.27646 0.28790 0.2992 0.31046 0.32141 0.33233 0.34304
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 1.05174 1.04434 1.04271 1.03857 1.03821 1.03696 1.03227
## Proportion of Variance 0.01064 0.01049 0.01045 0.01037 0.01036 0.01034 0.01025
## Cumulative Proportion 0.35367 0.36416 0.37462 0.38499 0.39535 0.40569 0.41594
## PC29 PC30 PC31 PC32 PC33 PC34 PC35
## Standard deviation 1.03078 1.02803 1.0251 1.02333 1.02189 1.01949 1.01715
## Proportion of Variance 0.01022 0.01016 0.0101 0.01007 0.01004 0.00999 0.00995
## Cumulative Proportion 0.42615 0.43631 0.4464 0.45649 0.46653 0.47652 0.48647
## PC36 PC37 PC38 PC39 PC40 PC41 PC42
## Standard deviation 1.01566 1.01413 1.01171 1.0098 1.00831 1.00479 1.00383
## Proportion of Variance 0.00992 0.00989 0.00984 0.0098 0.00978 0.00971 0.00969
## Cumulative Proportion 0.49639 0.50628 0.51612 0.5259 0.53570 0.54541 0.55510
## PC43 PC44 PC45 PC46 PC47 PC48 PC49
## Standard deviation 1.00217 1.00194 1.00182 1.00146 1.00120 1.00097 1.00075
## Proportion of Variance 0.00966 0.00965 0.00965 0.00964 0.00964 0.00963 0.00963
## Cumulative Proportion 0.56475 0.57441 0.58406 0.59370 0.60334 0.61297 0.62260
## PC50 PC51 PC52 PC53 PC54 PC55 PC56
## Standard deviation 1.00052 1.00018 1.00015 1.00005 0.99978 0.9993 0.99878
## Proportion of Variance 0.00963 0.00962 0.00962 0.00962 0.00961 0.0096 0.00959
## Cumulative Proportion 0.63223 0.64185 0.65147 0.66108 0.67069 0.6803 0.68989
## PC57 PC58 PC59 PC60 PC61 PC62 PC63
## Standard deviation 0.99798 0.99728 0.99662 0.99488 0.99467 0.99372 0.99318
## Proportion of Variance 0.00958 0.00956 0.00955 0.00952 0.00951 0.00949 0.00948
## Cumulative Proportion 0.69946 0.70903 0.71858 0.72809 0.73761 0.74710 0.75659
## PC64 PC65 PC66 PC67 PC68 PC69 PC70
## Standard deviation 0.99052 0.98995 0.98801 0.98608 0.98538 0.98314 0.98156
## Proportion of Variance 0.00943 0.00942 0.00939 0.00935 0.00934 0.00929 0.00926
## Cumulative Proportion 0.76602 0.77544 0.78483 0.79418 0.80352 0.81281 0.82207
## PC71 PC72 PC73 PC74 PC75 PC76 PC77
## Standard deviation 0.98094 0.97880 0.97661 0.96924 0.96591 0.95795 0.95564
## Proportion of Variance 0.00925 0.00921 0.00917 0.00903 0.00897 0.00882 0.00878
## Cumulative Proportion 0.83133 0.84054 0.84971 0.85874 0.86771 0.87654 0.88532
## PC78 PC79 PC80 PC81 PC82 PC83 PC84
## Standard deviation 0.95375 0.94992 0.93892 0.93181 0.92806 0.91281 0.89884
## Proportion of Variance 0.00875 0.00868 0.00848 0.00835 0.00828 0.00801 0.00777
## Cumulative Proportion 0.89407 0.90274 0.91122 0.91957 0.92785 0.93586 0.94363
## PC85 PC86 PC87 PC88 PC89 PC90 PC91
## Standard deviation 0.89134 0.85504 0.85248 0.81285 0.78871 0.77356 0.73836
## Proportion of Variance 0.00764 0.00703 0.00699 0.00635 0.00598 0.00575 0.00524
## Cumulative Proportion 0.95127 0.95830 0.96529 0.97164 0.97762 0.98337 0.98862
## PC92 PC93 PC94 PC95 PC96 PC97
## Standard deviation 0.69594 0.65642 0.49845 0.14222 1.69e-14 8.356e-15
## Proportion of Variance 0.00466 0.00414 0.00239 0.00019 0.00e+00 0.000e+00
## Cumulative Proportion 0.99327 0.99742 0.99981 1.00000 1.00e+00 1.000e+00
## PC98 PC99 PC100 PC101 PC102
## Standard deviation 6.635e-15 6.123e-15 5.397e-15 4.749e-15 4.24e-15
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.00e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.00e+00
## PC103 PC104
## Standard deviation 3.738e-15 2.032e-15
## Proportion of Variance 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00
screeplot(pr_income, type = "lines")
# find features w/ highest loadings per factor
rownames_to_column(as.data.frame(pr_income$rotation)) %>%
select(1:5) %>%
filter(abs(PC1) >= 0.25 | abs(PC2) >= 0.25 | abs(PC3) >= 0.25 | abs(PC4) >= 0.25)
prc <- bind_cols(select(income,income_above_50k),as.data.frame(pr_income$x)) %>%
select(1:5) %>%
rename("bachelorettes"=PC1, "high_education"=PC2, "white_usa"=PC3, "divorced_no_kids"=PC4)
prc %>%
select(income_above_50k, bachelorettes, high_education,
white_usa, divorced_no_kids) %>%
pivot_longer(cols = -income_above_50k, names_to = "component",values_to = "loading") %>%
ggplot(aes(loading, fill= income_above_50k))+
geom_density(alpha=0.4)+
theme(strip.background = element_blank(),
panel.background = element_blank(),
panel.grid.minor = element_blank(),
axis.title = element_blank(),
plot.title = element_text())+
scale_fill_manual(values = c("#7052F5", "#50E6D7"),
name = "Income Above $50k")+
facet_wrap(.~component, labeller = labeller(component =
c("divorced_no_kids" = "Mature Female Divorcée",
"high_education" = "Highly Educated American Women",
"bachelorettes" = " Young Low Educated Bachelorettes",
"white_usa" = "Young White Americans")))
biplot(pr_income, choices = c(1,2),
col = c("grey90", "#50E6D7"),
xlim=c(-0.020, 0.020), ylim=c(-0.006, 0.006))
income_model_whtmen <- income_2 %>%
select(income_above_50k, education_nums, education_Bachelors, education_Masters, education_Doctorate, `occupation_Prof-specialty`, `native-country_United-States`, sex_Male, `marital-status_Married-civ-spouse`, relationship_Husband, workclass_Private, race_White)
income_model_sglwomen <- income_2 %>%
select(income_above_50k, ages, sex_Female, education_nums, education_Masters, education_Doctorate, `occupation_Prof-specialty`, `occupation_Exec-managerial`, `native-country_United-States`, `marital-status_Never-married`, `marital-status_Divorced`, `relationship_Not-in-family`)
income_model_whtmen$income_above_50k <- as.factor(income_model_whtmen$income_above_50k)
ctrl <- trainControl(method = "cv", number = 3, classProbs=TRUE, summaryFunction = twoClassSummary)
set.seed(504)
income_index <- createDataPartition(income_model_whtmen$income_above_50k, p = 0.80, list = FALSE)
train <- income_model_whtmen[ income_index, ]
test <- income_model_whtmen[-income_index, ]
# example spec for rf
fit <- train(make.names(income_above_50k) ~ .,
data = train,
method = "rf",
tuneLength = 3,
metric = "ROC",
trControl = ctrl)
fit
## Random Forest
##
## 36179 samples
## 11 predictor
## 2 classes: 'FALSE.', 'TRUE.'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 24119, 24119, 24120
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.7838102 0.9478172 0.4142969
## 6 0.7449016 0.9359106 0.4528828
## 11 0.7510866 0.9323461 0.4574551
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
confusionMatrix(predict(fit, test),factor(make.names(test$income_above_50k)))
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE. TRUE.
## FALSE. 6432 1281
## TRUE. 370 960
##
## Accuracy : 0.8174
## 95% CI : (0.8093, 0.8253)
## No Information Rate : 0.7522
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.433
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9456
## Specificity : 0.4284
## Pos Pred Value : 0.8339
## Neg Pred Value : 0.7218
## Prevalence : 0.7522
## Detection Rate : 0.7113
## Detection Prevalence : 0.8529
## Balanced Accuracy : 0.6870
##
## 'Positive' Class : FALSE.
##
library(pROC)
myRoc <- roc(test$income_above_50k, predict(fit, test, type="prob")[,2])
plot(myRoc)
auc(myRoc)
## Area under the curve: 0.7771
income_model_sglwomen$income_above_50k <- as.factor(income_model_sglwomen$income_above_50k)
ctrl2 <- trainControl(method = "cv", number = 3, classProbs=TRUE, summaryFunction = twoClassSummary)
set.seed(504)
income_index2 <- createDataPartition(income_model_sglwomen$income_above_50k, p = 0.80, list = FALSE)
train2 <- income_model_sglwomen[ income_index2, ]
test2 <- income_model_sglwomen[-income_index2, ]
# example spec for rf
fit2 <- train(make.names(income_above_50k) ~ .,
data = train2,
method = "rf",
tuneLength = 3,
metric = "ROC",
trControl = ctrl2)
fit2
## Random Forest
##
## 36179 samples
## 11 predictor
## 2 classes: 'FALSE.', 'TRUE.'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 24119, 24119, 24120
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.7997318 0.9414966 0.4500948
## 6 0.8163877 0.9173524 0.5203524
## 11 0.8113665 0.9066219 0.4931415
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 6.
confusionMatrix(predict(fit2, test2),factor(make.names(test2$income_above_50k)))
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE. TRUE.
## FALSE. 6214 1051
## TRUE. 588 1190
##
## Accuracy : 0.8188
## 95% CI : (0.8107, 0.8266)
## No Information Rate : 0.7522
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4777
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9136
## Specificity : 0.5310
## Pos Pred Value : 0.8553
## Neg Pred Value : 0.6693
## Prevalence : 0.7522
## Detection Rate : 0.6872
## Detection Prevalence : 0.8034
## Balanced Accuracy : 0.7223
##
## 'Positive' Class : FALSE.
##
myRoc2 <- roc(test2$income_above_50k, predict(fit2, test2, type="prob")[,2])
plot(myRoc2)
auc(myRoc2)
## Area under the curve: 0.8069
print(fit2)
## Random Forest
##
## 36179 samples
## 11 predictor
## 2 classes: 'FALSE.', 'TRUE.'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 24119, 24119, 24120
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.7997318 0.9414966 0.4500948
## 6 0.8163877 0.9173524 0.5203524
## 11 0.8113665 0.9066219 0.4931415
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 6.
print(fit2$bestTune)
## mtry
## 2 6
set.seed(504)
fit_final <- train(make.names(income_above_50k) ~ .,
data = train2,
method = "rf",
trControl = ctrl2,
tuneLength = fit2$bestTune)
myRoc3 <- roc(test2$income_above_50k, predict(fit_final, test2, type="prob")[,2])
plot(myRoc3)
auc(myRoc3)
## Area under the curve: 0.8093
confusionMatrix(predict(fit_final, test2),factor(make.names(test2$income_above_50k)))
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE. TRUE.
## FALSE. 6164 1050
## TRUE. 638 1191
##
## Accuracy : 0.8133
## 95% CI : (0.8051, 0.8213)
## No Information Rate : 0.7522
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4664
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9062
## Specificity : 0.5315
## Pos Pred Value : 0.8544
## Neg Pred Value : 0.6512
## Prevalence : 0.7522
## Detection Rate : 0.6816
## Detection Prevalence : 0.7977
## Balanced Accuracy : 0.7188
##
## 'Positive' Class : FALSE.
##