Working with Income data set

The objective of this exercise is to study the Income data set from the source given below, understand the variables used in the data set, and use transformations to create a tidy data set and subset the data for further analysis.

http://archive.ics.uci.edu/ml/machine-learning-databases/

# Load the Adult ("Income") data. The file has no header row, so the 15
# column names are supplied while reading. Text columns stay character
# (not factor) and keep the leading space present in the raw file.
Income_data <- read.table(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
  header = FALSE,
  sep = ",",
  col.names = c(
    "age", "workclass", "fnlwgt", "education", "education_number",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "class"
  ),
  stringsAsFactors = FALSE,
  na.strings = "NA"
)

# Structure overview: dimensions plus type and first values of every column
str(Income_data)

## 'data.frame':    32561 obs. of  15 variables:
##  $ age             : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ workclass       : chr  " State-gov" " Self-emp-not-inc" " Private" " Private" ...
##  $ fnlwgt          : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
##  $ education       : chr  " Bachelors" " Bachelors" " HS-grad" " 11th" ...
##  $ education_number: int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital_status  : chr  " Never-married" " Married-civ-spouse" " Divorced" " Married-civ-spouse" ...
##  $ occupation      : chr  " Adm-clerical" " Exec-managerial" " Handlers-cleaners" " Handlers-cleaners" ...
##  $ relationship    : chr  " Not-in-family" " Husband" " Not-in-family" " Husband" ...
##  $ race            : chr  " White" " White" " White" " Black" ...
##  $ sex             : chr  " Male" " Male" " Male" " Male" ...
##  $ capital_gain    : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital_loss    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours_per_week  : int  40 13 40 40 40 40 16 45 50 40 ...
##  $ native_country  : chr  " United-States" " United-States" " United-States" " United-States" ...
##  $ class           : chr  " <=50K" " <=50K" " <=50K" " <=50K" ...

# Per-column summary: quartiles and mean for the integer columns; character
# columns only report length, class and mode
summary(Income_data)

##       age         workclass             fnlwgt         education        
##  Min.   :17.00   Length:32561       Min.   :  12285   Length:32561      
##  1st Qu.:28.00   Class :character   1st Qu.: 117827   Class :character  
##  Median :37.00   Mode  :character   Median : 178356   Mode  :character  
##  Mean   :38.58                      Mean   : 189778                     
##  3rd Qu.:48.00                      3rd Qu.: 237051                     
##  Max.   :90.00                      Max.   :1484705                     
##  education_number marital_status      occupation        relationship      
##  Min.   : 1.00    Length:32561       Length:32561       Length:32561      
##  1st Qu.: 9.00    Class :character   Class :character   Class :character  
##  Median :10.00    Mode  :character   Mode  :character   Mode  :character  
##  Mean   :10.08                                                            
##  3rd Qu.:12.00                                                            
##  Max.   :16.00                                                            
##      race               sex             capital_gain    capital_loss   
##  Length:32561       Length:32561       Min.   :    0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:    0   1st Qu.:   0.0  
##  Mode  :character   Mode  :character   Median :    0   Median :   0.0  
##                                        Mean   : 1078   Mean   :  87.3  
##                                        3rd Qu.:    0   3rd Qu.:   0.0  
##                                        Max.   :99999   Max.   :4356.0  
##  hours_per_week  native_country        class          
##  Min.   : 1.00   Length:32561       Length:32561      
##  1st Qu.:40.00   Class :character   Class :character  
##  Median :40.00   Mode  :character   Mode  :character  
##  Mean   :40.44                                        
##  3rd Qu.:45.00                                        
##  Max.   :99.00

### Analysis- there are 32561 records and 15 variables in the data set.
## The class variable is the target variable, related to income.



# Inspect the distinct values of the target variable `class`
# (the raw values carry a leading space)

unique(Income_data$class)

## [1] " <=50K" " >50K"

### Transformation- for further analysis of the data, this variable is transformed to the nominal values 0 and 1 for ease of use.  Income <= 50k --> 0 ; Income > 50k --> 1

# Recode the income bracket labels in place: " <=50K" becomes "0" and
# " >50K" becomes "1" (the column is character, so the numbers are stored
# as text; the leading space is part of the raw labels).
Income_data$class <- replace(Income_data$class, Income_data$class == " <=50K", 0)
Income_data$class <- replace(Income_data$class, Income_data$class == " >50K", 1)

# Store the recoded target as a factor
Income_data$class <- factor(Income_data$class)

# Row count per income class
table(Income_data$class)

## 
##     0     1 
## 24720  7841

# Confirm the recode: `class` is now a two-level factor
str(Income_data$class)

##  Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 2 2 ...

# Distinct values after the recode
unique(Income_data$class)

## [1] 0 1
## Levels: 0 1

### Analysis- Classification variable has been transformed to a factor variable with value 0 & 1 for ease of analysis


# Age: overall distribution, then income class counts for every age value

hist(Income_data$age)

barplot(table(Income_data$class,Income_data$age),legend.text = TRUE)

### Analysis- the age distribution looks right-skewed (long right tail).
# It looks like people in the data set have higher earnings between the late 20s and early 60s. There are few data points at the ends of the spectrum, above about 75 and near the minimum age of 17.
# To make a better data set, exclude the extreme ages (the subset step keeps 20 <= age < 70) --> subset condition...(1)


# Looking at the education_number (numeric) and education (label) variables

hist(Income_data$education_number,xlim = c(0,18))

# Distinct education_number codes, in order of first appearance
unique(Income_data$education_number)

##  [1] 13  9  7 14  5 10 12 11  4 16 15  3  6  2  1  8

### Analysis- education_number takes the integer values 1 to 16; the data seem to peak around education number 9-10

### Transformation of education variable to adjust the level order

# education is read in as a character vector
str(Income_data$education)

##  chr [1:32561] " Bachelors" " Bachelors" " HS-grad" " 11th" ...

# Frequency of every education label, first as a plot and then as a table
# NOTE(review): ylim stops at 10000 while " HS-grad" has 10501 rows, so that
# spike is clipped in the plot
plot(table(Income_data$education),ylim=c(0,10000))

table(Income_data$education)

## 
##          10th          11th          12th       1st-4th       5th-6th 
##           933          1175           433           168           333 
##       7th-8th           9th    Assoc-acdm     Assoc-voc     Bachelors 
##           646           514          1067          1382          5355 
##     Doctorate       HS-grad       Masters     Preschool   Prof-school 
##           413         10501          1723            51           576 
##  Some-college 
##          7291

# Convert education to an ordered factor that follows the schooling
# progression, from " Preschool" (lowest) to " Doctorate" (highest).
# The level strings keep the leading space present in the raw data; any value
# not listed here would become NA.
# " Assoc-voc" is placed before " Assoc-acdm" so that the level order agrees
# with education_number (11 and 12 respectively).
# ordered() already returns a factor, so no further as.factor() call is needed
# (as.factor() returns an existing factor unchanged).
Income_data$education <- ordered(
  Income_data$education,
  levels = c(
    " Preschool", " 1st-4th", " 5th-6th", " 7th-8th", " 9th", " 10th",
    " 11th", " 12th", " HS-grad", " Some-college", " Assoc-voc",
    " Assoc-acdm", " Bachelors", " Masters", " Prof-school", " Doctorate"
  )
)


# Income class counts stacked within each education level
# NOTE(review): ylim stops at 10000 while " HS-grad" has 10501 rows, so its
# bar is clipped
barplot(table( Income_data$class,Income_data$education),ylim = c(0,10000))

###  The education variable seems to have the same relationship with class as education_number.


# education is a factor at this point, so plot() is expected to draw one box
# of education_number per education level
plot(Income_data$education, Income_data$education_number)

### Analysis- It indicates that the education_number and education variables represent almost the same information, so only one of the two is required in the final data set ....
#subset condition ...(2)

#analysis of variable working class

# Distinct workclass labels; note the " ?" entry among them
unique(Income_data$workclass)

## [1] " State-gov"        " Self-emp-not-inc" " Private"         
## [4] " Federal-gov"      " Local-gov"        " ?"               
## [7] " Self-emp-inc"     " Without-pay"      " Never-worked"

# Give the " ?" marker a readable label
Income_data$workclass <- replace(
  Income_data$workclass,
  Income_data$workclass == " ?",
  "Not Known"
)

# Mosaic of workclass (first dimension) against income class (second)
mosaicplot(table(Income_data$workclass, Income_data$class))

### Analysis- looking at the data and the chart, it appears that most of the observations work in private concerns.
#There are some records where workclass has the value '?', 'Without-pay' or 'Never-worked'; it does not make sense to keep those records in the data set (note: the subset step further below removes only the unknown '?' rows)...
#Subset condition ..(3)


# Analysis of income data by marital status and race


# Income class counts stacked within each marital status
barplot(table(Income_data$class,Income_data$marital_status),legend.text = TRUE,ylim=c(0,14000))

# Income class counts stacked within each race
barplot(table(Income_data$class,Income_data$race),legend.text = TRUE,ylim=c(0,22000))

### Analysis- from the charts above it appears that race is a factor in the given data set which is related to class.



# Variable sex and income relationship

# NOTE(review): ylim stops at 20000 while " Male" has 15128 + 6662 = 21790
# rows, so that stacked bar is clipped
barplot(table(Income_data$class,Income_data$sex),legend.text =TRUE,ylim=c(0,20000))

# Count of each income class within each sex (summary() of a factor gives
# the count per level)
tapply(Income_data$class,Income_data$sex,summary)

## $` Female`
##    0    1 
## 9592 1179 
## 
## $` Male`
##     0     1 
## 15128  6662

### Analysis- from the plot and the table above it appears that males have a higher proportion of observations earning more than 50k. So gender is a factor in income.



# How hours per week influence income

str(Income_data$hours_per_week)

##  int [1:32561] 40 13 40 40 40 40 16 45 50 40 ...

# class is a factor here, so plot() is expected to draw one box of
# hours_per_week per income class
plot(Income_data$class,Income_data$hours_per_week,ylim=c(0,100))

# Counts of every hours_per_week value, stacked by income class
barplot(table(Income_data$class,Income_data$hours_per_week),legend.text = TRUE,ylim = c(0,14000))

### Analysis- It is very interesting to see that the median for both income classes is close to 40,
#but for the high income group the IQR is higher than for the lower income group. But there are a lot of outliers in the data.
#If this variable is required, it will need filtering of extreme cases.



# Country variable analysis

# Income class counts stacked within each native_country value
# NOTE(review): ylim stops at 25000 while " United-States" has 29170 rows, so
# that bar is clipped
barplot(table(Income_data$class,Income_data$native_country),ylim = c(0,25000))

# Row count for every native_country value (" ?" is presumably the marker for
# an unknown country)
table(Income_data$native_country)

## 
##                           ?                    Cambodia 
##                         583                          19 
##                      Canada                       China 
##                         121                          75 
##                    Columbia                        Cuba 
##                          59                          95 
##          Dominican-Republic                     Ecuador 
##                          70                          28 
##                 El-Salvador                     England 
##                         106                          90 
##                      France                     Germany 
##                          29                         137 
##                      Greece                   Guatemala 
##                          29                          64 
##                       Haiti          Holand-Netherlands 
##                          44                           1 
##                    Honduras                        Hong 
##                          13                          20 
##                     Hungary                       India 
##                          13                         100 
##                        Iran                     Ireland 
##                          43                          24 
##                       Italy                     Jamaica 
##                          73                          81 
##                       Japan                        Laos 
##                          62                          18 
##                      Mexico                   Nicaragua 
##                         643                          34 
##  Outlying-US(Guam-USVI-etc)                        Peru 
##                          14                          31 
##                 Philippines                      Poland 
##                         198                          60 
##                    Portugal                 Puerto-Rico 
##                          37                         114 
##                    Scotland                       South 
##                          12                          80 
##                      Taiwan                    Thailand 
##                          51                          18 
##             Trinadad&Tobago               United-States 
##                          19                       29170 
##                     Vietnam                  Yugoslavia 
##                          67                          16

# Share of rows whose native_country is " United-States"
sum(Income_data$native_country == " United-States") / nrow(Income_data)

## [1] 0.895857

### Analysis- looking at the data set by the country variable, it appears that the USA accounts for around 89.6 % of the data.
#Considering that, it would make sense to subset only the USA data for further analysis to get a correct representation..
#subset condition.....(4)


# Variable Capital-gain and Capital loss

# capital_gain against income class
plot(Income_data$class,Income_data$capital_gain)

# Counts of every distinct capital_gain value, stacked by income class
barplot(table(Income_data$class,Income_data$capital_gain))

# Share of rows with capital_gain equal to 0
length(Income_data$capital_gain[Income_data$capital_gain==0])/length(Income_data$capital_gain)

## [1] 0.9167102

### Analysis- about 91.7% of the values of this variable are 0, so this variable can be removed from the tidy data set.
# Subset condition...(5)



# Distribution of capital_loss
hist(Income_data$capital_loss)

# Share of rows with capital_loss equal to 0
length(Income_data$capital_loss[Income_data$capital_loss==0])/length(Income_data$capital_loss)

## [1] 0.9533491

### Analysis- about 95.3% of the values of this variable are 0, so this variable can be removed from the tidy data set. Subset condition...(6)



#***** Create a tidy subset of the data using the conditions identified above

# Row filters:
#   condition 1 - keep 20 <= age < 70 (age 70 itself is excluded);
#   condition 3 - drop rows whose workclass was relabelled "Not Known";
#   condition 4 - keep only rows with native_country " United-States".
Income_data_sub <- subset(
  Income_data,
  age >= 20 & age < 70 &
    workclass != "Not Known" &
    native_country == " United-States"
)

# Row count of the full data set
nrow(Income_data)

## [1] 32561

# Row count left after the row filters
nrow(Income_data_sub)

## [1] 25790

# Drop the columns that are not required for further analysis:
#   condition 2 - education (education_number carries the same information);
#   conditions 5 and 6 - capital_gain and capital_loss (0 for most rows).
Income_data_sub <- Income_data_sub[
  ,
  setdiff(
    names(Income_data_sub),
    c("education", "capital_gain", "capital_loss")
  ),
  drop = FALSE
]


## Turn education_number into an ordered factor with levels 1 < 2 < ... < 16
Income_data_sub$education_number <- factor(
  Income_data_sub$education_number,
  levels = 1:16,
  ordered = TRUE
)

# Structure of the tidy subset
str(Income_data_sub)

## 'data.frame':    25790 obs. of  12 variables:
##  $ age             : int  39 50 38 53 37 52 31 42 37 23 ...
##  $ workclass       : chr  " State-gov" " Self-emp-not-inc" " Private" " Private" ...
##  $ fnlwgt          : int  77516 83311 215646 234721 284582 209642 45781 159449 280464 122272 ...
##  $ education_number: Ord.factor w/ 16 levels "1"<"2"<"3"<"4"<..: 13 13 9 7 14 9 14 13 10 13 ...
##  $ marital_status  : chr  " Never-married" " Married-civ-spouse" " Divorced" " Married-civ-spouse" ...
##  $ occupation      : chr  " Adm-clerical" " Exec-managerial" " Handlers-cleaners" " Handlers-cleaners" ...
##  $ relationship    : chr  " Not-in-family" " Husband" " Not-in-family" " Husband" ...
##  $ race            : chr  " White" " White" " White" " Black" ...
##  $ sex             : chr  " Male" " Male" " Male" " Male" ...
##  $ hours_per_week  : int  40 13 40 40 40 45 50 40 80 30 ...
##  $ native_country  : chr  " United-States" " United-States" " United-States" " United-States" ...
##  $ class           : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 2 2 2 1 ...

## Summary of the final data set: quartiles and mean for integer columns,
## level counts for the factor columns (education_number, class)
summary(Income_data_sub)

##       age         workclass             fnlwgt        education_number
##  Min.   :20.00   Length:25790       Min.   :  13769   9      :8721    
##  1st Qu.:29.00   Class :character   1st Qu.: 115795   10     :5862    
##  Median :38.00   Mode  :character   Median : 176796   13     :4569    
##  Mean   :38.93                      Mean   : 186866   14     :1459    
##  3rd Qu.:47.00                      3rd Qu.: 233511   11     :1220    
##  Max.   :69.00                      Max.   :1484705   12     : 938    
##                                                       (Other):3021    
##  marital_status      occupation        relationship      
##  Length:25790       Length:25790       Length:25790      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##      race               sex            hours_per_week  native_country    
##  Length:25790       Length:25790       Min.   : 1.00   Length:25790      
##  Class :character   Class :character   1st Qu.:40.00   Class :character  
##  Mode  :character   Mode  :character   Median :40.00   Mode  :character  
##                                        Mean   :41.91                     
##                                        3rd Qu.:45.00                     
##                                        Max.   :99.00                     
##                                                                          
##  class    
##  0:18880  
##  1: 6910  
##           
##           
##           
##           
##

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Working with Income data set

Arindam

February 6, 2016