The dataset can be obtained from https://archive.ics.uci.edu/ml/datasets/Adult. The prediction task is to determine whether a person makes over 50K a year.
# Load packages for the analysis. All required libraries are listed in this section for clarity.
library('lattice')
library('ggplot2') # visualization
## Warning: package 'ggplot2' was built under R version 3.4.1
library('ggthemes') # visualization
## Warning: package 'ggthemes' was built under R version 3.4.1
library('scales') # visualization
## Warning: package 'scales' was built under R version 3.4.1
library('dplyr') # data manipulation
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('mice') # imputation
## Warning: package 'mice' was built under R version 3.4.2
library('randomForest') # classification algorithm
## Warning: package 'randomForest' was built under R version 3.4.1
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library('rpart') # for decision tree
## Warning: package 'rpart' was built under R version 3.4.3
library('ROCR')
## Warning: package 'ROCR' was built under R version 3.4.1
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.1
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library('ROCR')
library('randomForest')
library('corrr')
## Warning: package 'corrr' was built under R version 3.4.1
library('corrplot')
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
library('glue')
## Warning: package 'glue' was built under R version 3.4.2
##
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
##
## collapse
library('caTools')
## Warning: package 'caTools' was built under R version 3.4.1
library('data.table')
## Warning: package 'data.table' was built under R version 3.4.2
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
require("GGally")
## Loading required package: GGally
## Warning: package 'GGally' was built under R version 3.4.3
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
require("geosphere")
## Loading required package: geosphere
## Warning: package 'geosphere' was built under R version 3.4.2
require("gmapsdistance")
## Loading required package: gmapsdistance
## Warning: package 'gmapsdistance' was built under R version 3.4.2
require("tidyr")
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 3.4.2
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:mice':
##
## complete
library('corrplot')
#source("distance.R")
library('car')
## Warning: package 'car' was built under R version 3.4.2
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library('caret')
## Warning: package 'caret' was built under R version 3.4.3
library('gclus')
## Warning: package 'gclus' was built under R version 3.4.1
## Loading required package: cluster
## Warning: package 'cluster' was built under R version 3.4.2
library('visdat')
## Warning: package 'visdat' was built under R version 3.4.1
library('psych')
## Warning: package 'psych' was built under R version 3.4.2
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:randomForest':
##
## outlier
## The following objects are masked from 'package:scales':
##
## alpha, rescale
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library('leaflet')
## Warning: package 'leaflet' was built under R version 3.4.1
library('leaflet.extras')
## Warning: package 'leaflet.extras' was built under R version 3.4.1
# library("PerformanceAnalytics")
library('GPArotation')
## Warning: package 'GPArotation' was built under R version 3.4.1
library('MVN')
## Warning: package 'MVN' was built under R version 3.4.2
## sROC 0.1-2 loaded
##
## Attaching package: 'MVN'
## The following object is masked from 'package:psych':
##
## mardia
library('psych')
library('MASS')
## Warning: package 'MASS' was built under R version 3.4.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library('psy')
## Warning: package 'psy' was built under R version 3.4.1
##
## Attaching package: 'psy'
## The following object is masked from 'package:psych':
##
## wkappa
library('corpcor')
## Warning: package 'corpcor' was built under R version 3.4.1
library('fastmatch')
## Warning: package 'fastmatch' was built under R version 3.4.1
##
## Attaching package: 'fastmatch'
## The following object is masked from 'package:dplyr':
##
## coalesce
library('plyr')
## Warning: package 'plyr' was built under R version 3.4.1
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library('car')
library('ggcorrplot')
## Warning: package 'ggcorrplot' was built under R version 3.4.2
library('cluster')
library('caTools')
library('rpart')
library('rpart.plot')
## Warning: package 'rpart.plot' was built under R version 3.4.3
library('rattle')
## Warning: package 'rattle' was built under R version 3.4.2
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:randomForest':
##
## importance
library('RColorBrewer')
## Warning: package 'RColorBrewer' was built under R version 3.4.1
library('data.table')
library('ROCR')
library('maptree')
## Warning: package 'maptree' was built under R version 3.4.2
library('tree')
## Warning: package 'tree' was built under R version 3.4.3
library('dummies') # for converting categorical into dummy one
## Warning: package 'dummies' was built under R version 3.4.1
## dummies-1.5.6 provided by Decision Patterns
library('caret')
library('pscl') ## for McFadden R2
## Warning: package 'pscl' was built under R version 3.4.3
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library('randomForest')
library('StatMeasures')
## Warning: package 'StatMeasures' was built under R version 3.4.3
library('sqldf')
## Warning: package 'sqldf' was built under R version 3.4.3
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.4.1
## Loading required package: proto
## Warning: package 'proto' was built under R version 3.4.1
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.4.1
library('purrr')
## Warning: package 'purrr' was built under R version 3.4.3
##
## Attaching package: 'purrr'
## The following object is masked from 'package:plyr':
##
## compact
## The following object is masked from 'package:caret':
##
## lift
## The following object is masked from 'package:car':
##
## some
## The following object is masked from 'package:data.table':
##
## transpose
## The following object is masked from 'package:scales':
##
## discard
library('tidyr')
library('ggplot2')
library('gains')
## Warning: package 'gains' was built under R version 3.4.1
library('car')
library('MASS')
library('DiscriMiner')
## Warning: package 'DiscriMiner' was built under R version 3.4.3
library('klaR')
## Warning: package 'klaR' was built under R version 3.4.3
library('caret')
library('gridExtra')
## Warning: package 'gridExtra' was built under R version 3.4.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
Load the data and do basic preliminary validation
## [1] "age" "workclass" "fnlwgt" "education"
## [5] "education.num" "marital.status" "occupation" "relationship"
## [9] "race" "sex" "capital.gain" "capital.loss"
## [13] "hours.per.week" "native.country" "X"
## age workclass fnlwgt
## Min. :17.00 Private :22696 Min. : 12285
## 1st Qu.:28.00 Self-emp-not-inc: 2541 1st Qu.: 117827
## Median :37.00 Local-gov : 2093 Median : 178356
## Mean :38.58 ? : 1836 Mean : 189778
## 3rd Qu.:48.00 State-gov : 1298 3rd Qu.: 237051
## Max. :90.00 Self-emp-inc : 1116 Max. :1484705
## (Other) : 981
## education education.num marital.status
## HS-grad :10501 Min. : 1.00 Divorced : 4443
## Some-college: 7291 1st Qu.: 9.00 Married-AF-spouse : 23
## Bachelors : 5355 Median :10.00 Married-civ-spouse :14976
## Masters : 1723 Mean :10.08 Married-spouse-absent: 418
## Assoc-voc : 1382 3rd Qu.:12.00 Never-married :10683
## 11th : 1175 Max. :16.00 Separated : 1025
## (Other) : 5134 Widowed : 993
## occupation relationship
## Prof-specialty :4140 Husband :13193
## Craft-repair :4099 Not-in-family : 8305
## Exec-managerial:4066 Other-relative: 981
## Adm-clerical :3770 Own-child : 5068
## Sales :3650 Unmarried : 3446
## Other-service :3295 Wife : 1568
## (Other) :9541
## race sex capital.gain
## Amer-Indian-Eskimo: 311 Female:10771 Min. : 0
## Asian-Pac-Islander: 1039 Male :21790 1st Qu.: 0
## Black : 3124 Median : 0
## Other : 271 Mean : 1078
## White :27816 3rd Qu.: 0
## Max. :99999
##
## capital.loss hours.per.week native.country X
## Min. : 0.0 Min. : 1.00 United-States:29170 <=50K:24720
## 1st Qu.: 0.0 1st Qu.:40.00 Mexico : 643 >50K : 7841
## Median : 0.0 Median :40.00 ? : 583
## Mean : 87.3 Mean :40.44 Philippines : 198
## 3rd Qu.: 0.0 3rd Qu.:45.00 Germany : 137
## Max. :4356.0 Max. :99.00 Canada : 121
## (Other) : 1709
Basic box plots are drawn to understand the outliers; it appears that there are quite a few outliers here as well.
# Box plot (outlier points shown) for each column inspected for outliers.
for (outlier_col in c("age", "capital.gain", "capital.loss")) {
  boxplot(adult_data[[outlier_col]], outline = TRUE, col = "blue")
}
# Distribution of the workclass column.
histogram(adult_data$workclass)
Let's now look at the hours worked per week by employees against their salary cutoff. It is quite clear that most employees work approximately 35-40 hours per week.
# Histogram of weekly working hours, one panel per value of column X.
# Columns are referenced by bare name: aes() and the facet formula are
# evaluated against `data`, and ggplot2 discourages `adult_data$...` there
# (it also produced an axis label of "adult_data$hours.per.week").
ggplot(data = adult_data, aes(x = hours.per.week)) +
  geom_histogram(col = "red", fill = "green", bins = 25) +
  facet_grid(~X) +
  theme_bw()
The education and salary plot clearly indicates that education matters for salary, but it is also clear that approximately 8000 people fall in the lower salary group even though they have “Bachelors” (code 10), “HS-grad” (code 12) or “Some-college” (code 16). This diagram does not answer, though, why some “Bachelors” and “HS-grad” holders are earning less than 50K, so there must be some other factors influencing this.
# Numeric code of the education column, stored as a new field.
# NOTE(review): assumes `education` is a factor, so as.numeric() yields its
# integer level codes - confirm against the data-loading step.
adult_data$educationcode <- as.numeric(adult_data$education)

# Histogram of the education codes, one panel per value of column X.
# Bare column names are used because aes() and the facet formula are
# evaluated against `data`; ggplot2 discourages `adult_data$...` there.
ggplot(data = adult_data, aes(x = educationcode)) +
  geom_histogram(col = "red", fill = "green", bins = 25) +
  facet_grid(~X) +
  theme_bw()

# Keep only the rows whose education code is 10, 12 or 16.
# The earlier condition `educationcode == 10 | 12 | 16` was TRUE for every
# row, because the bare numbers 12 and 16 are coerced to TRUE by `|`, so
# nothing was filtered out. `%in%` tests membership in the set instead.
myvalue <- dplyr::filter(adult_data, educationcode %in% c(10, 12, 16))
## Warning: package 'bindrcpp' was built under R version 3.4.1
Further analysis is carried out below to understand how and why people with the same education have different salary levels. Observations are summarised below. – The number of women earning >50K is far lower than the number of men, so there is a gender impact on salary.
# Numeric code of the education column, stored as a new field.
# NOTE(review): assumes `education` is a factor, so as.numeric() yields its
# integer level codes - confirm against the data-loading step.
adult_data$educationcode <- as.numeric(adult_data$education)

# Education-code histogram split by salary class (column X) and sex.
# Bare column names are used because aes() and the facet formula are
# evaluated against `data`; ggplot2 discourages `adult_data$...` there.
ggplot(data = adult_data, aes(x = educationcode)) +
  geom_histogram(col = "red", fill = "green", bins = 25) +
  facet_grid(~ X + sex) +
  theme_bw()

# Same histogram, additionally split by race.
ggplot(data = adult_data, aes(x = educationcode)) +
  geom_histogram(col = "red", fill = "green", bins = 25) +
  facet_grid(~ X + sex + race) +
  theme_bw()