ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
packages <- c("dplyr","tidyr","ggplot2","ROCR","corrplot","sparklyr","caret","missForest",
"doParallel","R.utils","glmnet","coefplot","knitr","GGally")
ipak(packages)
## dplyr tidyr ggplot2 ROCR corrplot sparklyr caret
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## missForest doParallel R.utils glmnet coefplot knitr GGally
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE
temp <- tempfile()
if(!file.exists("./census-income.data.gz")){
fileUrl <- "https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income.data.gz"
download.file(fileUrl, destfile = "./census-income.data.gz", mode="wb")
}
if(!file.exists("census-income.data")){
gunzip("census-income.data.gz","census-income.data",remove=F)
}
rm(fileUrl)
temp <- tempfile()
if(!file.exists("./census-income.test.gz")){
fileUrl2 <- "https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income.test.gz"
download.file(fileUrl2, destfile = "./census-income.test.gz", mode="wb")
}
if(!file.exists("census-income.test")){
gunzip("census-income.test.gz","census-income.test",remove=F)
}
rm(fileUrl2)
df <- read.table("census-income.data", sep = ','
,stringsAsFactors = TRUE ,na.strings = c(" ?", " Not in universe"))
df_test <- read.table("census-income.test", sep = ','
,stringsAsFactors = TRUE,na.strings = c(" ?", " Not in universe"))
colns <- c("Age","Class_of_Worker","Industry_code","occupation_code",
"education","wage_per_hour","enrolled_in_edu_inst_last wk",
"marital_status","major_industry_code","major_occupation_code","race",
"hispanic_Origin","sex","member_of_a_labor_union","reason_for_unemployment",
"full_part_time_employment_stat","capital_gains","capital_losses",
"divdends_from_stocks","tax_filer_status","region_of_previous_residence",
"state_of_previous_residence","detailed_household_and_family_stat",
"detailed_household_summary_in_household","instance_weight",
"migration_code-change_msa","migration_code-change_reg",
"migration_code-move_within_reg","live_in_house_1_year_ago",
"migration_prev_res_sunbelt","num_persons_worked_for_employer",
"family_members_under 18","country_birth_father","country_birth_mother",
"country_birth_self","citizenship","ownbusiness_or_self_employed",
"fillinc_questionnaire_for_veterans_admin","veteran_benefits" ,
"Weeks_worked_in_year", "Year", "Salary")
colnames(df) <- colns
colnames(df_test) <- colns
head(df,3)
kable(str(df))
## 'data.frame': 199523 obs. of 42 variables:
## $ Age : int 73 58 18 9 10 48 42 28 47 34 ...
## $ Class_of_Worker : Factor w/ 8 levels " Federal government",..: NA 6 NA NA NA 4 4 4 2 4 ...
## $ Industry_code : int 0 4 0 0 0 40 34 4 43 4 ...
## $ occupation_code : int 0 34 0 0 0 10 3 40 26 37 ...
## $ education : Factor w/ 17 levels " 10th grade",..: 13 17 1 11 11 17 10 13 17 17 ...
## $ wage_per_hour : int 0 0 0 0 0 1200 0 0 876 0 ...
## $ enrolled_in_edu_inst_last wk : Factor w/ 2 levels " College or university",..: NA NA 2 NA NA NA NA NA NA NA ...
## $ marital_status : Factor w/ 7 levels " Divorced"," Married-A F spouse present",..: 7 1 5 5 5 3 3 5 3 3 ...
## $ major_industry_code : Factor w/ 24 levels " Agriculture",..: 15 5 15 15 15 7 8 5 6 5 ...
## $ major_occupation_code : Factor w/ 14 levels " Adm support including clerical",..: NA 8 NA NA NA 10 3 5 1 6 ...
## $ race : Factor w/ 5 levels " Amer Indian Aleut or Eskimo",..: 5 5 2 5 5 1 5 5 5 5 ...
## $ hispanic_Origin : Factor w/ 10 levels " All other"," Central or South American",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels " Female"," Male": 1 2 1 1 1 1 2 1 1 2 ...
## $ member_of_a_labor_union : Factor w/ 2 levels " No"," Yes": NA NA NA NA NA 1 NA NA 1 NA ...
## $ reason_for_unemployment : Factor w/ 5 levels " Job leaver",..: NA NA NA NA NA NA NA 2 NA NA ...
## $ full_part_time_employment_stat : Factor w/ 8 levels " Children or Armed Forces",..: 3 1 3 1 1 2 1 7 2 1 ...
## $ capital_gains : int 0 0 0 0 0 0 5178 0 0 0 ...
## $ capital_losses : int 0 0 0 0 0 0 0 0 0 0 ...
## $ divdends_from_stocks : int 0 0 0 0 0 0 0 0 0 0 ...
## $ tax_filer_status : Factor w/ 6 levels " Head of household",..: 5 1 5 5 5 3 3 6 3 3 ...
## $ region_of_previous_residence : Factor w/ 5 levels " Abroad"," Midwest",..: NA 4 NA NA NA NA NA NA NA NA ...
## $ state_of_previous_residence : Factor w/ 49 levels " Abroad"," Alabama",..: NA 5 NA NA NA NA NA NA NA NA ...
## $ detailed_household_and_family_stat : Factor w/ 38 levels " Child <18 ever marr not in subfamily",..: 30 21 8 3 3 37 21 36 37 21 ...
## $ detailed_household_summary_in_household : Factor w/ 8 levels " Child 18 or older",..: 7 5 1 3 3 8 5 6 8 5 ...
## $ instance_weight : num 1700 1054 992 1758 1069 ...
## $ migration_code-change_msa : Factor w/ 8 levels " Abroad to MSA",..: NA 3 NA 5 5 NA 5 NA NA 5 ...
## $ migration_code-change_reg : Factor w/ 7 levels " Abroad"," Different county same state",..: NA 7 NA 6 6 NA 6 NA NA 6 ...
## $ migration_code-move_within_reg : Factor w/ 8 levels " Abroad"," Different county same state",..: NA 8 NA 7 7 NA 7 NA NA 7 ...
## $ live_in_house_1_year_ago : Factor w/ 3 levels " No"," Not in universe under 1 year old",..: 2 1 2 3 3 2 3 2 2 3 ...
## $ migration_prev_res_sunbelt : Factor w/ 2 levels " No"," Yes": NA 2 NA NA NA NA NA NA NA NA ...
## $ num_persons_worked_for_employer : int 0 1 0 0 0 1 6 4 5 6 ...
## $ family_members_under 18 : Factor w/ 4 levels " Both parents present",..: NA NA NA 1 1 NA NA NA NA NA ...
## $ country_birth_father : Factor w/ 42 levels " Cambodia"," Canada",..: 40 40 41 40 40 31 40 40 40 40 ...
## $ country_birth_mother : Factor w/ 42 levels " Cambodia"," Canada",..: 40 40 41 40 40 40 40 40 40 40 ...
## $ country_birth_self : Factor w/ 42 levels " Cambodia"," Canada",..: 40 40 41 40 40 40 40 40 40 40 ...
## $ citizenship : Factor w/ 5 levels " Foreign born- Not a citizen of U S ",..: 5 5 1 5 5 5 5 5 5 5 ...
## $ ownbusiness_or_self_employed : int 0 0 0 0 0 2 0 0 0 0 ...
## $ fillinc_questionnaire_for_veterans_admin: Factor w/ 2 levels " No"," Yes": NA NA NA NA NA NA NA NA NA NA ...
## $ veteran_benefits : int 2 2 2 0 0 2 2 2 2 2 ...
## $ Weeks_worked_in_year : int 0 52 0 0 0 52 52 30 52 52 ...
## $ Year : int 95 94 95 94 94 95 94 95 95 94 ...
## $ Salary : Factor w/ 2 levels " - 50000."," 50000+.": 1 1 1 1 1 1 1 1 1 1 ...
|| || || ||
summary(df)
## Age Class_of_Worker Industry_code
## Min. : 0.00 Private : 72028 Min. : 0.00
## 1st Qu.:15.00 Self-employed-not incorporated: 8445 1st Qu.: 0.00
## Median :33.00 Local government : 7784 Median : 0.00
## Mean :34.49 State government : 4227 Mean :15.35
## 3rd Qu.:50.00 Self-employed-incorporated : 3265 3rd Qu.:33.00
## Max. :90.00 (Other) : 3529 Max. :51.00
## NA's :100245
## occupation_code education wage_per_hour
## Min. : 0.00 High school graduate :48407 Min. : 0.00
## 1st Qu.: 0.00 Children :47422 1st Qu.: 0.00
## Median : 0.00 Some college but no degree:27820 Median : 0.00
## Mean :11.31 Bachelors degree(BA AB BS):19865 Mean : 55.43
## 3rd Qu.:26.00 7th and 8th grade : 8007 3rd Qu.: 0.00
## Max. :46.00 10th grade : 7557 Max. :9999.00
## (Other) :40445
## enrolled_in_edu_inst_last wk marital_status
## College or university: 5688 Divorced :12710
## High school : 6892 Married-A F spouse present : 665
## NA's :186943 Married-civilian spouse present:84222
## Married-spouse absent : 1518
## Never married :86485
## Separated : 3460
## Widowed :10463
## major_industry_code
## Not in universe or children :100684
## Retail trade : 17070
## Manufacturing-durable goods : 9015
## Education : 8283
## Manufacturing-nondurable goods : 6897
## Finance insurance and real estate: 6145
## (Other) : 51429
## major_occupation_code
## Adm support including clerical: 14837
## Professional specialty : 13940
## Executive admin and managerial: 12495
## Other service : 12099
## Sales : 11783
## (Other) : 33685
## NA's :100684
## race hispanic_Origin
## Amer Indian Aleut or Eskimo: 2251 All other :171907
## Asian or Pacific Islander : 5835 Mexican-American : 8079
## Black : 20415 Mexican (Mexicano) : 7234
## Other : 3657 Central or South American: 3895
## White :167365 Puerto Rican : 3313
## Other Spanish : 2485
## (Other) : 2610
## sex member_of_a_labor_union reason_for_unemployment
## Female:103984 No : 16034 Job leaver : 598
## Male : 95539 Yes: 3030 Job loser - on layoff: 976
## NA's:180459 New entrant : 439
## Other job loser : 2038
## Re-entrant : 2019
## NA's :193453
##
## full_part_time_employment_stat capital_gains
## Children or Armed Forces :123769 Min. : 0.0
## Full-time schedules : 40736 1st Qu.: 0.0
## Not in labor force : 26808 Median : 0.0
## PT for non-econ reasons usually FT: 3322 Mean : 434.7
## Unemployed full-time : 2311 3rd Qu.: 0.0
## PT for econ reasons usually PT : 1209 Max. :99999.0
## (Other) : 1368
## capital_losses divdends_from_stocks tax_filer_status
## Min. : 0.00 Min. : 0.0 Head of household : 7426
## 1st Qu.: 0.00 1st Qu.: 0.0 Joint both 65+ : 8332
## Median : 0.00 Median : 0.0 Joint both under 65 :67383
## Mean : 37.31 Mean : 197.5 Joint one under 65 & one 65+: 3867
## 3rd Qu.: 0.00 3rd Qu.: 0.0 Nonfiler :75094
## Max. :4608.00 Max. :99999.0 Single :37421
##
## region_of_previous_residence state_of_previous_residence
## Abroad : 530 California : 1714
## Midwest : 3575 Utah : 1063
## Northeast: 2705 Florida : 849
## South : 4889 North Carolina: 812
## West : 4074 Abroad : 671
## NA's :183750 (Other) : 9956
## NA's :184458
## detailed_household_and_family_stat
## Householder :53248
## Child <18 never marr not in subfamily :50326
## Spouse of householder :41695
## Nonfamily householder :22213
## Child 18+ never marr Not in a subfamily:12030
## Secondary individual : 6122
## (Other) :13889
## detailed_household_summary_in_household instance_weight
## Householder :75475 Min. : 37.87
## Child under 18 never married :50426 1st Qu.: 1061.62
## Spouse of householder :41709 Median : 1618.31
## Child 18 or older :14430 Mean : 1740.38
## Other relative of householder: 9703 3rd Qu.: 2188.61
## Nonrelative of householder : 7601 Max. :18656.30
## (Other) : 179
## migration_code-change_msa migration_code-change_reg
## Nonmover : 82538 Nonmover : 82538
## MSA to MSA : 10601 Same county : 9812
## NonMSA to nonMSA: 2811 Different county same state : 2797
## MSA to nonMSA : 790 Different region : 1178
## NonMSA to MSA : 615 Different state same division: 991
## (Other) : 956 (Other) : 995
## NA's :101212 NA's :101212
## migration_code-move_within_reg
## Nonmover : 82538
## Same county : 9812
## Different county same state: 2797
## Different state in South : 973
## Different state in West : 679
## (Other) : 1512
## NA's :101212
## live_in_house_1_year_ago migration_prev_res_sunbelt
## No : 15773 No : 9987
## Not in universe under 1 year old:101212 Yes: 5786
## Yes : 82538 NA's:183750
##
##
##
##
## num_persons_worked_for_employer family_members_under 18
## Min. :0.000 Both parents present : 38983
## 1st Qu.:0.000 Father only present : 1883
## Median :1.000 Mother only present : 12772
## Mean :1.956 Neither parent present: 1653
## 3rd Qu.:4.000 NA's :144232
## Max. :6.000
##
## country_birth_father country_birth_mother country_birth_self
## United-States:159163 United-States:160479 United-States:176989
## Mexico : 10008 Mexico : 9781 Mexico : 5767
## Puerto-Rico : 2680 Puerto-Rico : 2473 Puerto-Rico : 1400
## Italy : 2212 Italy : 1844 Germany : 851
## Canada : 1380 Canada : 1451 Philippines : 845
## (Other) : 17367 (Other) : 17376 (Other) : 10278
## NA's : 6713 NA's : 6119 NA's : 3393
## citizenship
## Foreign born- Not a citizen of U S : 13401
## Foreign born- U S citizen by naturalization: 5855
## Native- Born abroad of American Parent(s) : 1756
## Native- Born in Puerto Rico or U S Outlying: 1519
## Native- Born in the United States :176992
##
##
## ownbusiness_or_self_employed fillinc_questionnaire_for_veterans_admin
## Min. :0.0000 No : 1593
## 1st Qu.:0.0000 Yes: 391
## Median :0.0000 NA's:197539
## Mean :0.1754
## 3rd Qu.:0.0000
## Max. :2.0000
##
## veteran_benefits Weeks_worked_in_year Year Salary
## Min. :0.000 Min. : 0.00 Min. :94.0 - 50000.:187141
## 1st Qu.:2.000 1st Qu.: 0.00 1st Qu.:94.0 50000+. : 12382
## Median :2.000 Median : 8.00 Median :94.0
## Mean :1.515 Mean :23.17 Mean :94.5
## 3rd Qu.:2.000 3rd Qu.:52.00 3rd Qu.:95.0
## Max. :2.000 Max. :52.00 Max. :95.0
##
dim(df)
## [1] 199523 42
dim(df_test)
## [1] 99762 42
nas <- inspectdf::inspect_na(df) #count percentage of NA's
todrop<-unlist(nas %>%
filter(pcnt>50) %>%
select(col_name)) # filtering features with NA percentage bigger than 50
near_zero_var <- nearZeroVar(df, freqCut = 99/1)
near_zero_var <- names(df)[near_zero_var] # finding variables with almost zero variance
df <- df %>%
select(-c(todrop,near_zero_var))
df <- df %>%
select(-instance_weight) # removing weight feature, we not gonna use it for prediction purposes
df <- df %>% # same information with other variables
select(-major_industry_code,-detailed_household_and_family_stat)
df <- df %>% # most of values are "nofiler"
select(-tax_filer_status,-live_in_house_1_year_ago)
registerDoParallel(cores=3)
dfn<- df %>%
select(-Salary) %>%
missForest(maxiter=1,ntree=10, variablewise=T, parallelize = "forests")
## missForest iteration 1 in progress...done!
df[,-20] <- dfn$ximp
ggplot(df, aes(x=Salary))+ geom_bar(aes(fill=Salary))
ggplot(df, aes(x=Age))+ geom_histogram(aes(colour=I("black"), fill=I("blue")),bins=20)
ggplot(df, aes(x=race))+ geom_bar(aes( colour=I("black"), fill=race))
ggplot(df, aes(x=sex))+ geom_bar(aes( colour=I("black"), fill=sex))
ggplot(df, aes(x=citizenship))+ geom_bar(aes( colour=I("black"), fill=citizenship)) + coord_flip() + theme(legend.position = "none")
ggplot(df, aes(x=as.factor(marital_status)))+ geom_bar(aes( colour=I("black"), fill=marital_status))
ggplot(df, aes(x=education))+ geom_bar(aes( colour=I("black"), fill=education)) + coord_flip() + theme(legend.position = "none")
## Data correlations
boxplot(Age~Salary, data= df , main = "Age vs Salary",
xlab = "Salary", ylab = "Age", col = "orange")
As we can see from the boxplot the age has important correlation with Salary. It looks like that you need to be older to earn more than 50.000.
boxplot(num_persons_worked_for_employer~Salary, data= df , main = "Num of person worked for employer vs Salary",
xlab = "Salary", ylab = "Num of person worked for employer", col = "Green")
Working in a place with more employees seems to be important for Salary
boxplot(Weeks_worked_in_year~Salary, data= df , main = "Weeks worked vs Salary",
xlab = "Salary", ylab = "Weeks worked in a year", col = "red")
Here we can see that in our database females are more than males but the males have significant more people earning more than 50.000. So we can see there is a correlation between sex and salary
ggplot(df, aes(x=as.factor(Salary))) + geom_bar(stat = "count", aes(fill=sex)) + xlab("Salary") + ggtitle("Salary with Sex") + theme_classic()
Race is also very important. it seems to be rare to earn more than 50k if you are not white.
qplot (Salary, data = df, fill = race) + facet_grid (. ~ race)
You need to have veteran benefits if you want to earn more than 50k
qplot (Salary, data = df, fill = as.factor(veteran_benefits) ) + facet_grid (. ~ veteran_benefits)
qplot (Salary, data = df, fill = marital_status) + facet_grid (. ~ marital_status)
df_test <- df_test %>% select(-c(todrop,near_zero_var,instance_weight,major_industry_code,
detailed_household_and_family_stat,tax_filer_status,
live_in_house_1_year_ago))
registerDoParallel(cores=3)
dfnt<- df_test %>%
select(-Salary) %>%
missForest(maxiter=1,ntree=10, variablewise=T, parallelize = "forests")
## missForest iteration 1 in progress...done!
df_test[,-20] <- dfnt$ximp
df$train <- T
df_test$train <- F
df_total <- rbind(df,df_test)
df_total$Salary <- ifelse(df_total$Salary==" - 50000.",0,1)
df_total$race <- ifelse(df_total$race==" White", "White", "Not White")
df_total$hispanic_Origin <- ifelse(df_total$hispanic_Origin==" All other", 0, 1)
df_total$citizenship <- ifelse(df_total$citizenship %in%
c(" Native- Born abroad of American Parent(s)",
" Native- Born in Puerto Rico or U S Outlying",
" Native- Born in the United States",
" Foreign born- U S citizen by naturalization"),
"US citizenship", "No US citizenship")
levels(df_total$marital_status)[2:4] <- "Married"
levels(df_total$education)[levels(df_total$education) %in% c(" Children"," Less than 1st grade",
" 1st 2nd 3rd or 4th grade",
" 5th or 6th grade"," 7th and 8th grade",
" 9th grade"," 10th grade"," 11th grade",
" 12th grade no diploma" )] <- 0
levels(df_total$education)[levels(df_total$education) %in% c(" High school graduate",
" Some college but no degree")] <- 1
levels(df_total$education)[levels(df_total$education) %in%
c(" Associates degree-academic program",
" Prof school degree (MD DDS DVM LLB JD)",
" Associates degree-occup /vocational")] <- 2
levels(df_total$education)[levels(df_total$education)==" Bachelors degree(BA AB BS)"] <- 3
levels(df_total$education)[levels(df_total$education)==" Masters degree(MA MS MEng MEd MSW MBA)"] <- 4
levels(df_total$education)[levels(df_total$education)==" Doctorate degree(PhD EdD)"] <- 5
df_total <- df_total %>% mutate_if(is.character,factor)
vect <- c("Industry_code","occupation_code","num_persons_worked_for_employer",
"ownbusiness_or_self_employed","Weeks_worked_in_year","Year","veteran_benefits")
df_total <- df_total %>% mutate_if(names(df_total) %in% vect, as.factor)
df_train <- df_total %>% filter(train==T) %>% select(-train)
df_test <- df_total %>% filter(train==F) %>% select(-train)
sc <- spark_connect(master = "local")
dfs_train <- sdf_copy_to(sc,df_train)
dfs_test <- sdf_copy_to(sc,df_test)
GBT_model <- dfs_train %>%
ml_gbt_classifier(Salary~.)
GBT_pred <- ml_predict(GBT_model, dfs_test)
GBT_pred2 <- ml_binary_classification_evaluator(GBT_pred)
GBT_pred2
## [1] 0.9239043
cm <- table(pull(GBT_pred, label), pull(GBT_pred, prediction))
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 92964 612
## 1 4646 1540
##
## Accuracy : 0.9473
## 95% CI : (0.9459, 0.9487)
## No Information Rate : 0.9784
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3485
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9524
## Specificity : 0.7156
## Pos Pred Value : 0.9935
## Neg Pred Value : 0.2489
## Prevalence : 0.9784
## Detection Rate : 0.9319
## Detection Prevalence : 0.9380
## Balanced Accuracy : 0.8340
##
## 'Positive' Class : 0
##
featureImport <- ml_tree_feature_importance(GBT_model)
featureImport[1:10,] %>% ggplot(aes(reorder(feature, importance),importance,fill=feature)) +
geom_bar(stat = "identity") + coord_flip() + ggtitle("Top 10 feature importance") +
theme(legend.position="none") + ylab("importance") + xlab("feature")