The objective of this exercise is to study the Income data set from the source given below, understand the variables used in the data set, use transformations to create a tidy data set, and subset the data for further analysis.
http://archive.ics.uci.edu/ml/machine-learning-databases/
# Load the UCI "adult" census extract. The file has no header row, so the
# fifteen variable names are supplied while reading.
# Character fields stay plain strings (not factors) and only the literal
# text "NA" is treated as missing.
Income_data <- read.table(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
  header = FALSE,
  sep = ",",
  stringsAsFactors = FALSE,
  na.strings = "NA",
  col.names = c(
    "age", "workclass", "fnlwgt", "education", "education_number",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "class"
  )
)
# Inspect the structure of the data: dimensions, column types and the first few values
str(Income_data)
## 'data.frame': 32561 obs. of 15 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ workclass : chr " State-gov" " Self-emp-not-inc" " Private" " Private" ...
## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ education : chr " Bachelors" " Bachelors" " HS-grad" " 11th" ...
## $ education_number: int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital_status : chr " Never-married" " Married-civ-spouse" " Divorced" " Married-civ-spouse" ...
## $ occupation : chr " Adm-clerical" " Exec-managerial" " Handlers-cleaners" " Handlers-cleaners" ...
## $ relationship : chr " Not-in-family" " Husband" " Not-in-family" " Husband" ...
## $ race : chr " White" " White" " White" " Black" ...
## $ sex : chr " Male" " Male" " Male" " Male" ...
## $ capital_gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours_per_week : int 40 13 40 40 40 40 16 45 50 40 ...
## $ native_country : chr " United-States" " United-States" " United-States" " United-States" ...
## $ class : chr " <=50K" " <=50K" " <=50K" " <=50K" ...
# Per-column summary: quartiles and mean for numeric columns, length/class/mode for character columns
summary(Income_data)
## age workclass fnlwgt education
## Min. :17.00 Length:32561 Min. : 12285 Length:32561
## 1st Qu.:28.00 Class :character 1st Qu.: 117827 Class :character
## Median :37.00 Mode :character Median : 178356 Mode :character
## Mean :38.58 Mean : 189778
## 3rd Qu.:48.00 3rd Qu.: 237051
## Max. :90.00 Max. :1484705
## education_number marital_status occupation relationship
## Min. : 1.00 Length:32561 Length:32561 Length:32561
## 1st Qu.: 9.00 Class :character Class :character Class :character
## Median :10.00 Mode :character Mode :character Mode :character
## Mean :10.08
## 3rd Qu.:12.00
## Max. :16.00
## race sex capital_gain capital_loss
## Length:32561 Length:32561 Min. : 0 Min. : 0.0
## Class :character Class :character 1st Qu.: 0 1st Qu.: 0.0
## Mode :character Mode :character Median : 0 Median : 0.0
## Mean : 1078 Mean : 87.3
## 3rd Qu.: 0 3rd Qu.: 0.0
## Max. :99999 Max. :4356.0
## hours_per_week native_country class
## Min. : 1.00 Length:32561 Length:32561
## 1st Qu.:40.00 Class :character Class :character
## Median :40.00 Mode :character Mode :character
## Mean :40.44
## 3rd Qu.:45.00
## Max. :99.00
### Analysis- there are 32561 records and 15 variables in the data set.
## The class variable is the target variable; it records the income bracket.
# List the distinct values of the target variable class
unique(Income_data$class)
## [1] " <=50K" " >50K"
### Transformation- for further analysis, this variable is recoded to the nominal values 0 and 1 for ease of use: Income <=50K --> 0 ; Income >50K --> 1
# Recode the income bracket: " <=50K" becomes 0 and " >50K" becomes 1. The
# column is character at this point, so the codes are stored as "0" and "1".
Income_data$class <- replace(
  Income_data$class,
  Income_data$class == " <=50K",
  0
)
Income_data$class <- replace(
  Income_data$class,
  Income_data$class == " >50K",
  1
)
# Store the recoded target as a factor.
Income_data$class <- as.factor(Income_data$class)
# Number of observations in each income class.
table(Income_data$class)
##
## 0 1
## 24720 7841
# Verify the recode: class should now be a factor with the two levels "0" and "1"
str(Income_data$class)
## Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 2 2 ...
unique(Income_data$class)
## [1] 0 1
## Levels: 0 1
### Analysis- Classification variable has been transformed to a factor variable with value 0 & 1 for ease of analysis
# Look at the variable age and how it relates to the target variable class
# Histogram of age
hist(Income_data$age)
# One bar per age value, split by income class; the legend shows the class codes
barplot(table(Income_data$class,Income_data$age),legend.text = TRUE)
### Analysis- the age distribution is right-skewed, with a long right tail.
# It looks like people in the data set have higher earnings between their late 20s and early 60s. There are few data points at the extremes of the range (above 75 and close to the minimum age of 17).
# To make a better data set, exclude records with age >= 70 or age < 20 --> subset condition...(1)
# Looking at the education_number and education variables
# Histogram of education_number
hist(Income_data$education_number,xlim = c(0,18))
# Distinct education_number codes present in the data
unique(Income_data$education_number)
## [1] 13 9 7 14 5 10 12 11 4 16 15 3 6 2 1 8
### Analysis- the data seems to have its peak around an education number of 8-10
### Transformation of the education variable to put its levels in order
# education is still a plain character vector at this point
str(Income_data$education)
## chr [1:32561] " Bachelors" " Bachelors" " HS-grad" " 11th" ...
# Frequency of each education label, first as a plot and then as a table
plot(table(Income_data$education),ylim=c(0,10000))
table(Income_data$education)
##
## 10th 11th 12th 1st-4th 5th-6th
## 933 1175 433 168 333
## 7th-8th 9th Assoc-acdm Assoc-voc Bachelors
## 646 514 1067 1382 5355
## Doctorate HS-grad Masters Preschool Prof-school
## 413 10501 1723 51 576
## Some-college
## 7291
# Convert education to an ordered factor. The levels follow the order listed
# here (" Preschool" first, " Doctorate" last) and keep the leading space that
# the raw values carry.
# No as.factor() call follows: an ordered factor is already a factor, so
# as.factor() would return it unchanged.
Income_data$education <- factor(
  Income_data$education,
  levels = c(
    " Preschool", " 1st-4th", " 5th-6th", " 7th-8th", " 9th", " 10th",
    " 11th", " 12th", " HS-grad", " Some-college", " Assoc-acdm",
    " Assoc-voc", " Bachelors", " Masters", " Prof-school", " Doctorate"
  ),
  ordered = TRUE
)
# One bar per education level, split by income class
barplot(table( Income_data$class,Income_data$education),ylim = c(0,10000))
### The education variable seems to have the same relationship with class as education number.
# Compare the education labels with education_number (education is a factor here, so plot() draws one box per level)
plot(Income_data$education, Income_data$education_number)
### Analysis- It indicates that education_number and education represent almost the same value, so only one of the two is required in the final data set ....
#subset condition ...(2)
# Analysis of the workclass variable: list its distinct values
unique(Income_data$workclass)
## [1] " State-gov" " Self-emp-not-inc" " Private"
## [4] " Federal-gov" " Local-gov" " ?"
## [7] " Self-emp-inc" " Without-pay" " Never-worked"
# Relabel the " ?" placeholder in workclass as "Not Known".
Income_data$workclass <- replace(
  Income_data$workclass,
  Income_data$workclass == " ?",
  "Not Known"
)
# Mosaic plot of workclass against income class.
mosaicplot(table( Income_data$workclass,Income_data$class))
### Analysis- looking at the data and the chart, it appears that most of the observations work in the private sector.
#There are some records where workclass has the value '?', 'Without-pay' or 'Never-worked'; it will not make sense to keep those records in the data set...
#Subset condition ..(3)
# Analysis of income class by marital status and by race
# One bar per marital status, split by income class
barplot(table(Income_data$class,Income_data$marital_status),legend.text = TRUE,ylim=c(0,14000))
# One bar per race, split by income class
barplot(table(Income_data$class,Income_data$race),legend.text = TRUE,ylim=c(0,22000))
### Analysis- from above scenario it appears that race is a factor in the given data set which is related to class.
# Relationship between sex and income class
# One bar per sex, split by income class
barplot(table(Income_data$class,Income_data$sex),legend.text =TRUE,ylim=c(0,20000))
# Income-class counts within each sex (summary() of a factor returns the count per level)
tapply(Income_data$class,Income_data$sex,summary)
## $` Female`
## 0 1
## 9592 1179
##
## $` Male`
## 0 1
## 15128 6662
### Analysis- from the plot and the table above it appears that males have a higher proportion of observations earning more than 50k. So gender is a factor in income.
# How hours per week relate to income class
str(Income_data$hours_per_week)
## int [1:32561] 40 13 40 40 40 40 16 45 50 40 ...
# class is a factor here, so plot() draws one box of hours_per_week per income class
plot(Income_data$class,Income_data$hours_per_week,ylim=c(0,100))
# One bar per hours_per_week value, split by income class
barplot(table(Income_data$class,Income_data$hours_per_week),legend.text = TRUE,ylim = c(0,14000))
### Analysis- It is very interesting to see that the median for both income classes is close to 40,
#but for the high income group the IQR is larger than for the lower income group. There are also a lot of outliers in the data.
#If this variable is to be used, the extreme cases will need to be filtered out.
# Analysis of the native_country variable
# One bar per country, split by income class
barplot(table(Income_data$class,Income_data$native_country),ylim = c(0,25000))
# Number of records per country
table(Income_data$native_country)
##
## ? Cambodia
## 583 19
## Canada China
## 121 75
## Columbia Cuba
## 59 95
## Dominican-Republic Ecuador
## 70 28
## El-Salvador England
## 106 90
## France Germany
## 29 137
## Greece Guatemala
## 29 64
## Haiti Holand-Netherlands
## 44 1
## Honduras Hong
## 13 20
## Hungary India
## 13 100
## Iran Ireland
## 43 24
## Italy Jamaica
## 73 81
## Japan Laos
## 62 18
## Mexico Nicaragua
## 643 34
## Outlying-US(Guam-USVI-etc) Peru
## 14 31
## Philippines Poland
## 198 60
## Portugal Puerto-Rico
## 37 114
## Scotland South
## 12 80
## Taiwan Thailand
## 51 18
## Trinadad&Tobago United-States
## 19 29170
## Vietnam Yugoslavia
## 67 16
# Share of records whose native_country is " United-States".
# The comparison is summed directly (each TRUE counts as 1). na.rm = TRUE keeps
# a missing country from being counted as a match, which indexing with a
# logical vector that contains NA would do.
sum(Income_data$native_country == " United-States", na.rm = TRUE) /
  length(Income_data$native_country)
## [1] 0.895857
### Analysis- looking at the data set by country, it appears that the USA accounts for around 89.6 % of the data.
#Considering that, it would make sense to keep only the USA data for further analysis to get a correct representation..
#subset condition.....(4)
# Capital gain and capital loss variables
# class is a factor here, so plot() draws one box of capital_gain per income class
plot(Income_data$class,Income_data$capital_gain)
# One bar per capital_gain value, split by income class
barplot(table(Income_data$class,Income_data$capital_gain))
# Share of records with zero capital gain. Summing the comparison with
# na.rm = TRUE keeps a missing value from being counted as a zero, which
# indexing with a logical vector that contains NA would do.
sum(Income_data$capital_gain == 0, na.rm = TRUE) /
  length(Income_data$capital_gain)
## [1] 0.9167102
### Analysis- about 91.7% of the values of this variable are 0, so this variable can be removed from the tidy data set.
# Subset condition...(5)
# Histogram of capital_loss
hist(Income_data$capital_loss)
# Share of records with zero capital loss. Summing the comparison with
# na.rm = TRUE keeps a missing value from being counted as a zero, which
# indexing with a logical vector that contains NA would do.
sum(Income_data$capital_loss == 0, na.rm = TRUE) /
  length(Income_data$capital_loss)
## [1] 0.9533491
### Analysis- about 95.3% of the values of this variable are 0, so this variable can be removed from the tidy data set. Subset condition...(6)
# ***** Create a tidy subset of the data using the conditions noted above.
# Rows kept: age from 20 up to (but not including) 70 (condition 1), workclass
# other than "Not Known" (condition 3) and native_country equal to
# " United-States" (condition 4).
# NOTE(review): rows with workclass " Without-pay" or " Never-worked" are not
# removed by this filter - confirm whether condition 3 was meant to drop them.
Income_data_sub <- subset(
  Income_data,
  age >= 20 & age < 70 &
    workclass != "Not Known" &
    native_country == " United-States"
)
# Row count before filtering
nrow(Income_data)
## [1] 32561
# Row count of the filtered subset
nrow(Income_data_sub)
## [1] 25790
# Drop the columns that are not required for further analysis: education
# (condition 2; education_number is kept instead) and capital_gain /
# capital_loss (conditions 5 and 6; zero for most of the data).
Income_data_sub <- subset(
  Income_data_sub,
  select = -c(education, capital_gain, capital_loss)
)
# Turn education_number into an ordered factor with levels 1 to 16.
Income_data_sub$education_number <- factor(
  Income_data_sub$education_number,
  levels = 1:16,
  ordered = TRUE
)
# Structure of the final subset
str(Income_data_sub)
## 'data.frame': 25790 obs. of 12 variables:
## $ age : int 39 50 38 53 37 52 31 42 37 23 ...
## $ workclass : chr " State-gov" " Self-emp-not-inc" " Private" " Private" ...
## $ fnlwgt : int 77516 83311 215646 234721 284582 209642 45781 159449 280464 122272 ...
## $ education_number: Ord.factor w/ 16 levels "1"<"2"<"3"<"4"<..: 13 13 9 7 14 9 14 13 10 13 ...
## $ marital_status : chr " Never-married" " Married-civ-spouse" " Divorced" " Married-civ-spouse" ...
## $ occupation : chr " Adm-clerical" " Exec-managerial" " Handlers-cleaners" " Handlers-cleaners" ...
## $ relationship : chr " Not-in-family" " Husband" " Not-in-family" " Husband" ...
## $ race : chr " White" " White" " White" " Black" ...
## $ sex : chr " Male" " Male" " Male" " Male" ...
## $ hours_per_week : int 40 13 40 40 40 45 50 40 80 30 ...
## $ native_country : chr " United-States" " United-States" " United-States" " United-States" ...
## $ class : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 2 2 2 1 ...
## Summary of the final data set (after the row filter and column removal)
summary(Income_data_sub)
## age workclass fnlwgt education_number
## Min. :20.00 Length:25790 Min. : 13769 9 :8721
## 1st Qu.:29.00 Class :character 1st Qu.: 115795 10 :5862
## Median :38.00 Mode :character Median : 176796 13 :4569
## Mean :38.93 Mean : 186866 14 :1459
## 3rd Qu.:47.00 3rd Qu.: 233511 11 :1220
## Max. :69.00 Max. :1484705 12 : 938
## (Other):3021
## marital_status occupation relationship
## Length:25790 Length:25790 Length:25790
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## race sex hours_per_week native_country
## Length:25790 Length:25790 Min. : 1.00 Length:25790
## Class :character Class :character 1st Qu.:40.00 Class :character
## Mode :character Mode :character Median :40.00 Mode :character
## Mean :41.91
## 3rd Qu.:45.00
## Max. :99.00
##
## class
## 0:18880
## 1: 6910
##
##
##
##
##
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.