R Markdown

The dataset can be obtained from https://archive.ics.uci.edu/ml/datasets/Adult. The prediction task is to determine whether a person makes over 50K a year.

# Load the packages used for the analysis; all required libraries are listed in this section for clarity
library('lattice')
library('ggplot2') # visualization
## Warning: package 'ggplot2' was built under R version 3.4.1
library('ggthemes') # visualization
## Warning: package 'ggthemes' was built under R version 3.4.1
library('scales') # visualization
## Warning: package 'scales' was built under R version 3.4.1
library('dplyr') # data manipulation
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('mice') # imputation
## Warning: package 'mice' was built under R version 3.4.2
library('randomForest') # classification algorithm
## Warning: package 'randomForest' was built under R version 3.4.1
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library('rpart') # for decision tree
## Warning: package 'rpart' was built under R version 3.4.3
library('ROCR')
## Warning: package 'ROCR' was built under R version 3.4.1
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.1
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library('ROCR')
library('randomForest')
library('corrr')
## Warning: package 'corrr' was built under R version 3.4.1
library('corrplot')
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
library('glue')
## Warning: package 'glue' was built under R version 3.4.2
## 
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
## 
##     collapse
library('caTools')
## Warning: package 'caTools' was built under R version 3.4.1
library('data.table')
## Warning: package 'data.table' was built under R version 3.4.2
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
require("GGally")
## Loading required package: GGally
## Warning: package 'GGally' was built under R version 3.4.3
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
require("geosphere")
## Loading required package: geosphere
## Warning: package 'geosphere' was built under R version 3.4.2
require("gmapsdistance")
## Loading required package: gmapsdistance
## Warning: package 'gmapsdistance' was built under R version 3.4.2
require("tidyr")
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 3.4.2
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:mice':
## 
##     complete
library('corrplot')
#source("distance.R")
library('car')
## Warning: package 'car' was built under R version 3.4.2
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library('caret')
## Warning: package 'caret' was built under R version 3.4.3
library('gclus')
## Warning: package 'gclus' was built under R version 3.4.1
## Loading required package: cluster
## Warning: package 'cluster' was built under R version 3.4.2
library('visdat')
## Warning: package 'visdat' was built under R version 3.4.1
library('psych')
## Warning: package 'psych' was built under R version 3.4.2
## 
## Attaching package: 'psych'
## The following object is masked from 'package:car':
## 
##     logit
## The following object is masked from 'package:randomForest':
## 
##     outlier
## The following objects are masked from 'package:scales':
## 
##     alpha, rescale
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library('leaflet')
## Warning: package 'leaflet' was built under R version 3.4.1
library('leaflet.extras')
## Warning: package 'leaflet.extras' was built under R version 3.4.1
# library("PerformanceAnalytics")
library('GPArotation')
## Warning: package 'GPArotation' was built under R version 3.4.1
library('MVN')
## Warning: package 'MVN' was built under R version 3.4.2
## sROC 0.1-2 loaded
## 
## Attaching package: 'MVN'
## The following object is masked from 'package:psych':
## 
##     mardia
library('psych')
library('MASS')
## Warning: package 'MASS' was built under R version 3.4.3
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library('psy')
## Warning: package 'psy' was built under R version 3.4.1
## 
## Attaching package: 'psy'
## The following object is masked from 'package:psych':
## 
##     wkappa
library('corpcor')
## Warning: package 'corpcor' was built under R version 3.4.1
library('fastmatch')
## Warning: package 'fastmatch' was built under R version 3.4.1
## 
## Attaching package: 'fastmatch'
## The following object is masked from 'package:dplyr':
## 
##     coalesce
library('plyr')
## Warning: package 'plyr' was built under R version 3.4.1
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library('car')
library('ggcorrplot')
## Warning: package 'ggcorrplot' was built under R version 3.4.2
library('cluster')
library('caTools')
library('rpart')
library('rpart.plot')
## Warning: package 'rpart.plot' was built under R version 3.4.3
library('rattle')
## Warning: package 'rattle' was built under R version 3.4.2
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
## 
## Attaching package: 'rattle'
## The following object is masked from 'package:randomForest':
## 
##     importance
library('RColorBrewer')
## Warning: package 'RColorBrewer' was built under R version 3.4.1
library('data.table')
library('ROCR')
library('maptree')
## Warning: package 'maptree' was built under R version 3.4.2
library('tree')
## Warning: package 'tree' was built under R version 3.4.3
library('dummies') # for converting categorical into dummy one
## Warning: package 'dummies' was built under R version 3.4.1
## dummies-1.5.6 provided by Decision Patterns
library('caret')
library('pscl') ## for  McFadden R2
## Warning: package 'pscl' was built under R version 3.4.3
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library('randomForest')
library('StatMeasures')
## Warning: package 'StatMeasures' was built under R version 3.4.3
library('sqldf')
## Warning: package 'sqldf' was built under R version 3.4.3
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.4.1
## Loading required package: proto
## Warning: package 'proto' was built under R version 3.4.1
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.4.1
library('purrr')
## Warning: package 'purrr' was built under R version 3.4.3
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:plyr':
## 
##     compact
## The following object is masked from 'package:caret':
## 
##     lift
## The following object is masked from 'package:car':
## 
##     some
## The following object is masked from 'package:data.table':
## 
##     transpose
## The following object is masked from 'package:scales':
## 
##     discard
library('tidyr')
library('ggplot2')
library('gains')
## Warning: package 'gains' was built under R version 3.4.1
library('car')
library('MASS')
library('DiscriMiner')
## Warning: package 'DiscriMiner' was built under R version 3.4.3
library('klaR')
## Warning: package 'klaR' was built under R version 3.4.3
library('caret')
library('gridExtra')
## Warning: package 'gridExtra' was built under R version 3.4.3
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine

Including Plots..

Load the data and do basic preliminary validation

##  [1] "age"            "workclass"      "fnlwgt"         "education"     
##  [5] "education.num"  "marital.status" "occupation"     "relationship"  
##  [9] "race"           "sex"            "capital.gain"   "capital.loss"  
## [13] "hours.per.week" "native.country" "X"
##       age                    workclass         fnlwgt       
##  Min.   :17.00    Private         :22696   Min.   :  12285  
##  1st Qu.:28.00    Self-emp-not-inc: 2541   1st Qu.: 117827  
##  Median :37.00    Local-gov       : 2093   Median : 178356  
##  Mean   :38.58    ?               : 1836   Mean   : 189778  
##  3rd Qu.:48.00    State-gov       : 1298   3rd Qu.: 237051  
##  Max.   :90.00    Self-emp-inc    : 1116   Max.   :1484705  
##                  (Other)          :  981                    
##          education     education.num                  marital.status 
##   HS-grad     :10501   Min.   : 1.00    Divorced             : 4443  
##   Some-college: 7291   1st Qu.: 9.00    Married-AF-spouse    :   23  
##   Bachelors   : 5355   Median :10.00    Married-civ-spouse   :14976  
##   Masters     : 1723   Mean   :10.08    Married-spouse-absent:  418  
##   Assoc-voc   : 1382   3rd Qu.:12.00    Never-married        :10683  
##   11th        : 1175   Max.   :16.00    Separated            : 1025  
##  (Other)      : 5134                    Widowed              :  993  
##             occupation            relationship  
##   Prof-specialty :4140    Husband       :13193  
##   Craft-repair   :4099    Not-in-family : 8305  
##   Exec-managerial:4066    Other-relative:  981  
##   Adm-clerical   :3770    Own-child     : 5068  
##   Sales          :3650    Unmarried     : 3446  
##   Other-service  :3295    Wife          : 1568  
##  (Other)         :9541                          
##                   race            sex         capital.gain  
##   Amer-Indian-Eskimo:  311    Female:10771   Min.   :    0  
##   Asian-Pac-Islander: 1039    Male  :21790   1st Qu.:    0  
##   Black             : 3124                   Median :    0  
##   Other             :  271                   Mean   : 1078  
##   White             :27816                   3rd Qu.:    0  
##                                              Max.   :99999  
##                                                             
##   capital.loss    hours.per.week         native.country       X        
##  Min.   :   0.0   Min.   : 1.00    United-States:29170    <=50K:24720  
##  1st Qu.:   0.0   1st Qu.:40.00    Mexico       :  643    >50K : 7841  
##  Median :   0.0   Median :40.00    ?            :  583                 
##  Mean   :  87.3   Mean   :40.44    Philippines  :  198                 
##  3rd Qu.:   0.0   3rd Qu.:45.00    Germany      :  137                 
##  Max.   :4356.0   Max.   :99.00    Canada       :  121                 
##                                   (Other)       : 1709

Basic box plots are drawn to understand the outliers, and it seems that there are quite a few outliers here as well.

# Box plots (outlier points drawn) for the numeric columns of interest,
# one plot per column, all with the same styling.
for (numeric_column in c("age", "capital.gain", "capital.loss")) {
  boxplot(adult_data[[numeric_column]], outline = TRUE, col = "blue")
}

# Lattice histogram of the workclass column. The call is kept in its
# `adult_data$workclass` form because lattice derives the default axis
# label from the expression passed in.
histogram(adult_data$workclass)

Let's now understand the relationship between the hours per week worked by employees and their salary package (cutoff). It is quite clear that most of the employees work approximately 35-40 hours per week.

# Histogram of weekly working hours, drawn in one panel per value of the
# income column `X` (<=50K / >50K).
# Columns are referenced by bare name so that ggplot2 resolves them inside
# `data`; using `adult_data$...` in aes()/facet_grid() bypasses that lookup,
# is fragile once the data are split into facets, and produces an
# "adult_data$hours.per.week" axis label.
ggplot(data = adult_data, aes(x = hours.per.week)) +
  geom_histogram(col = "red", fill = "green", bins = 25) +
  facet_grid(~X) +
  theme_bw()

The education and salary plot clearly indicates that education matters for salary, but it is also clear that approximately 8000 people have “Bachelors” (code 10), “HS-grad” (12) or “Some-college” (16) and still fall in the lower salary group. This diagram does not, however, explain why some “Bachelors” and “HS-grad” holders are earning less than 50K, so there must be some other factors influencing this.

# Add a numeric code for every education level (new field).
# NOTE(review): assumes `education` is a factor, so the code is the index of
# the factor level — confirm against the data-loading step.
adult_data$educationcode <- as.numeric(adult_data$education)

# Histogram of the education codes, one panel per value of the income
# column `X`. Bare column names are used so ggplot2 looks them up in `data`.
ggplot(data = adult_data, aes(x = educationcode)) +
  geom_histogram(col = "red", fill = "green", bins = 25) +
  facet_grid(~X) +
  theme_bw()

# Keep only the rows whose education code is 10, 12 or 16.
# The previous condition `educationcode == 10 | 12 | 16` was always TRUE
# (the non-zero numbers 12 and 16 are coerced to TRUE), so no row was ever
# dropped; `%in%` performs the intended set-membership test.
# `dplyr::` is spelled out because many attached packages mask each other.
myvalue <- dplyr::filter(adult_data, educationcode %in% c(10, 12, 16))
## Warning: package 'bindrcpp' was built under R version 3.4.1

Further analysis is carried out below to understand how and why the same education level can lead to different salary levels. Observations are summarised below. – The number of women earning >50K is far lower than the number of men, so gender has an impact on salary.

# Numeric code for every education level. The assignment is idempotent, so
# repeating it here keeps this section runnable on its own.
adult_data$educationcode <- as.numeric(adult_data$education)

# Education-code histogram split by income class (`X`) and sex.
# Bare column names are used so ggplot2 resolves them inside `data` instead
# of receiving detached `adult_data$...` vectors.
ggplot(data = adult_data, aes(x = educationcode)) +
  geom_histogram(col = "red", fill = "green", bins = 25) +
  facet_grid(~ X + sex) +
  theme_bw()

# Race-wise analysis: the same histogram split by income class, sex and race.
ggplot(data = adult_data, aes(x = educationcode)) +
  geom_histogram(col = "red", fill = "green", bins = 25) +
  facet_grid(~ X + sex + race) +
  theme_bw()