The objective of this exercise is to study the Income data set from the source given below, understand the variables used in the data set, and use transformations to create a tidy data set and subset the data for further analysis.

http://archive.ics.uci.edu/ml/machine-learning-databases/

# Load the Adult income data. The file has no header row, so the 15 column
# names are supplied through col.names. Only the literal string "NA" is
# treated as missing, so " ?" placeholders in the file are kept as text.
Income_data <- read.table(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
  header = FALSE,
  sep = ",",
  stringsAsFactors = FALSE,
  na.strings = "NA",
  col.names = c(
    "age", "workclass", "fnlwgt", "education", "education_number",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "class"
  )
)

# Structure of the data: column types and the first few values of each column
str(Income_data)
## 'data.frame':    32561 obs. of  15 variables:
##  $ age             : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ workclass       : chr  " State-gov" " Self-emp-not-inc" " Private" " Private" ...
##  $ fnlwgt          : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
##  $ education       : chr  " Bachelors" " Bachelors" " HS-grad" " 11th" ...
##  $ education_number: int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital_status  : chr  " Never-married" " Married-civ-spouse" " Divorced" " Married-civ-spouse" ...
##  $ occupation      : chr  " Adm-clerical" " Exec-managerial" " Handlers-cleaners" " Handlers-cleaners" ...
##  $ relationship    : chr  " Not-in-family" " Husband" " Not-in-family" " Husband" ...
##  $ race            : chr  " White" " White" " White" " Black" ...
##  $ sex             : chr  " Male" " Male" " Male" " Male" ...
##  $ capital_gain    : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital_loss    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours_per_week  : int  40 13 40 40 40 40 16 45 50 40 ...
##  $ native_country  : chr  " United-States" " United-States" " United-States" " United-States" ...
##  $ class           : chr  " <=50K" " <=50K" " <=50K" " <=50K" ...
# Per-column summary: quartiles and mean for the integer columns,
# length/class/mode for the character columns
summary(Income_data)
##       age         workclass             fnlwgt         education        
##  Min.   :17.00   Length:32561       Min.   :  12285   Length:32561      
##  1st Qu.:28.00   Class :character   1st Qu.: 117827   Class :character  
##  Median :37.00   Mode  :character   Median : 178356   Mode  :character  
##  Mean   :38.58                      Mean   : 189778                     
##  3rd Qu.:48.00                      3rd Qu.: 237051                     
##  Max.   :90.00                      Max.   :1484705                     
##  education_number marital_status      occupation        relationship      
##  Min.   : 1.00    Length:32561       Length:32561       Length:32561      
##  1st Qu.: 9.00    Class :character   Class :character   Class :character  
##  Median :10.00    Mode  :character   Mode  :character   Mode  :character  
##  Mean   :10.08                                                            
##  3rd Qu.:12.00                                                            
##  Max.   :16.00                                                            
##      race               sex             capital_gain    capital_loss   
##  Length:32561       Length:32561       Min.   :    0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:    0   1st Qu.:   0.0  
##  Mode  :character   Mode  :character   Median :    0   Median :   0.0  
##                                        Mean   : 1078   Mean   :  87.3  
##                                        3rd Qu.:    0   3rd Qu.:   0.0  
##                                        Max.   :99999   Max.   :4356.0  
##  hours_per_week  native_country        class          
##  Min.   : 1.00   Length:32561       Length:32561      
##  1st Qu.:40.00   Class :character   Class :character  
##  Median :40.00   Mode  :character   Mode  :character  
##  Mean   :40.44                                        
##  3rd Qu.:45.00                                        
##  Max.   :99.00
### Analysis- there are 32561 records and 15 variables in the data set.
## The class variable is the target variable; it records the income bracket.



# Inspect the target variable `class`

unique(Income_data[["class"]])
## [1] " <=50K" " >50K"
### Transformation- recode the target as a two-level factor for ease of use:
### " <=50K" --> 0 ; " >50K" --> 1

# The replacements are made on the character column, so the numbers become the
# strings "0" and "1" before the column is converted to a factor.
Income_data$class <- as.factor(
  replace(
    replace(Income_data$class, Income_data$class == " <=50K", 0),
    Income_data$class == " >50K",
    1
  )
)

# Confirm the recoding: counts per level, storage type and distinct values
table(Income_data[["class"]])
## 
##     0     1 
## 24720  7841
str(Income_data[["class"]])
##  Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 2 2 ...
unique(Income_data[["class"]])
## [1] 0 1
## Levels: 0 1
### Analysis- Classification variable has been transformed to a factor variable with value 0 & 1 for ease of analysis


# looking at variable age and how it is related to target variable class

# Histogram of age across all records
hist(Income_data$age)

# Record counts per age value, split by income class; legend.text = TRUE
# adds a legend with the class levels
barplot(table(Income_data$class,Income_data$age),legend.text = TRUE)

### Analysis- the age distribution looks right-skewed, with a long right tail.
#It looks like people in the data set have higher earnings between their late 20s and early 60s. #There are few data points towards the ends of the spectrum (above about 75 and at the youngest ages).
# To make a better data set, drop records with age 70 or above and with age below 20 --> subset condition...(1)


# Looking at Education_number & education variables

# Histogram of education_number with the x-axis set to 0-18
hist(Income_data$education_number,xlim = c(0,18))

# Distinct education_number codes present in the data
unique(Income_data$education_number)
##  [1] 13  9  7 14  5 10 12 11  4 16 15  3  6  2  1  8
### Analysis- the data seems to peak around an education number of 8-10

### Transformation of education variable to adjust the level

# education is read in as plain character strings
str(Income_data$education)
##  chr [1:32561] " Bachelors" " Bachelors" " HS-grad" " 11th" ...
# Record count per education label. The y-axis runs to 12000 so that the
# largest group (HS-grad, 10501 records) is not cut off at the axis limit.
plot(table(Income_data$education),ylim=c(0,12000))

table(Income_data$education)
## 
##          10th          11th          12th       1st-4th       5th-6th 
##           933          1175           433           168           333 
##       7th-8th           9th    Assoc-acdm     Assoc-voc     Bachelors 
##           646           514          1067          1382          5355 
##     Doctorate       HS-grad       Masters     Preschool   Prof-school 
##           413         10501          1723            51           576 
##  Some-college 
##          7291
# Turn education into an ordered factor whose levels run from the lowest to
# the highest attainment. The level strings keep the leading space found in
# the raw data. The result is already a factor, so no as.factor() step is
# needed afterwards.
Income_data$education <- factor(
  Income_data$education,
  levels = c(
    " Preschool", " 1st-4th", " 5th-6th", " 7th-8th", " 9th", " 10th",
    " 11th", " 12th", " HS-grad", " Some-college", " Assoc-acdm",
    " Assoc-voc", " Bachelors", " Masters", " Prof-school", " Doctorate"
  ),
  ordered = TRUE
)


# Counts of income class per education level. The y-axis runs to 12000 so
# that the HS-grad bar (10501 records) fits inside the axis.
barplot(table( Income_data$class,Income_data$education),ylim = c(0,12000))

###  The education variable seems to have the same relationship with class as education number.


# education (a factor at this point) plotted against education_number
plot(Income_data$education, Income_data$education_number)

### Analysis- It indicates that the education_number and education variables represent almost #the same information, so only one of the two is required in the final data set ....
#subset condition ...(2)

# Analysis of the workclass variable

unique(Income_data[["workclass"]])
## [1] " State-gov"        " Self-emp-not-inc" " Private"         
## [4] " Federal-gov"      " Local-gov"        " ?"               
## [7] " Self-emp-inc"     " Without-pay"      " Never-worked"
# Relabel the " ?" placeholder so that unknown work classes are explicit
Income_data$workclass <- replace(
  Income_data$workclass,
  Income_data$workclass == " ?",
  "Not Known"
)

# Mosaic plot of workclass against income class
mosaicplot(table( Income_data$workclass,Income_data$class))

### Analysis-looking at the data and chart it appears that most of the observations work in the private sector. 
#There are some records where workclass has the value '?', 'Without-pay' or 'Never-worked'; it does not make sense to keep those records in the data set...
#Subset condition ..(3)


# Income class broken down by marital status and by race


barplot(
  table(Income_data$class, Income_data$marital_status),
  legend.text = TRUE,
  ylim = c(0, 14000)
)

barplot(
  table(Income_data$class, Income_data$race),
  legend.text = TRUE,
  ylim = c(0, 22000)
)

### Analysis- from above scenario it appears that race is a factor in the given data set which is related to class.



# Relationship between sex and income class

# Counts of income class per sex. The y-axis runs to 25000 so that the Male
# bar (15128 + 6662 = 21790 records) fits inside the axis.
barplot(table(Income_data$class,Income_data$sex),legend.text =TRUE,ylim=c(0,25000))

# Count of each income class within each sex
tapply(Income_data$class,Income_data$sex,summary)
## $` Female`
##    0    1 
## 9592 1179 
## 
## $` Male`
##     0     1 
## 15128  6662
### Analysis- from the plot and table above it appears that males have a higher proportion of observations earning more than 50k. So gender is a factor in income.



# How hours per week influence income

str(Income_data$hours_per_week)
##  int [1:32561] 40 13 40 40 40 40 16 45 50 40 ...
# Weekly hours for each income class (class is a factor at this point),
# with the y-axis fixed to 0-100
plot(Income_data$class,Income_data$hours_per_week,ylim=c(0,100))

# Record counts per distinct hours_per_week value, split by income class
barplot(table(Income_data$class,Income_data$hours_per_week),legend.text = TRUE,ylim = c(0,14000))

### Analysis- It is very interesting to see that the median for both income classes is close to 40, 
#but the IQR is higher for the high income group than for the lower income group. There are also a lot of outliers in the data. 
#If this variable is required, the extreme cases will need to be filtered out. 



# Country variable analysis

# Counts of income class per native country. The y-axis runs to 30000 so
# that the United-States bar (29170 records) fits inside the axis.
barplot(table(Income_data$class,Income_data$native_country),ylim = c(0,30000))

# Record count per native country
table(Income_data$native_country)
## 
##                           ?                    Cambodia 
##                         583                          19 
##                      Canada                       China 
##                         121                          75 
##                    Columbia                        Cuba 
##                          59                          95 
##          Dominican-Republic                     Ecuador 
##                          70                          28 
##                 El-Salvador                     England 
##                         106                          90 
##                      France                     Germany 
##                          29                         137 
##                      Greece                   Guatemala 
##                          29                          64 
##                       Haiti          Holand-Netherlands 
##                          44                           1 
##                    Honduras                        Hong 
##                          13                          20 
##                     Hungary                       India 
##                          13                         100 
##                        Iran                     Ireland 
##                          43                          24 
##                       Italy                     Jamaica 
##                          73                          81 
##                       Japan                        Laos 
##                          62                          18 
##                      Mexico                   Nicaragua 
##                         643                          34 
##  Outlying-US(Guam-USVI-etc)                        Peru 
##                          14                          31 
##                 Philippines                      Poland 
##                         198                          60 
##                    Portugal                 Puerto-Rico 
##                          37                         114 
##                    Scotland                       South 
##                          12                          80 
##                      Taiwan                    Thailand 
##                          51                          18 
##             Trinadad&Tobago               United-States 
##                          19                       29170 
##                     Vietnam                  Yugoslavia 
##                          67                          16
# Share of records whose native country is the United States
sum(Income_data$native_country == " United-States") / nrow(Income_data)
## [1] 0.895857
### Analysis- looking at the data set by country it appears that the USA accounts for around 89.6 % of the records. 
#Considering that, it would make sense to subset only the USA data for further analysis to get a correct representation..
#subset condition.....(4)


# Capital gain and capital loss variables

# Capital gain for each income class
plot(Income_data$class,Income_data$capital_gain)

barplot(table(Income_data$class, Income_data$capital_gain))

# Share of records with zero capital gain
sum(Income_data$capital_gain == 0) / nrow(Income_data)
## [1] 0.9167102
### Analysis- over 91% of the values of this variable are 0, so the variable can be removed from the tidy data set. 
# Subset condition...(5)



# Distribution of capital loss
hist(Income_data$capital_loss)

# Share of records with zero capital loss
sum(Income_data$capital_loss == 0) / nrow(Income_data)
## [1] 0.9533491
### Analysis- about 95% of the values of this variable are 0, so the variable can be removed from the tidy data set. Subset condition...(6)



#***** Build the tidy subset using the conditions identified above

#&& Row filters: condition 1 keeps ages 20 to 69; condition 3 drops workclass
#&& "Not Known"; condition 4 keeps native_country " United-States"

Income_data_sub <- subset(
  Income_data,
  age >= 20 & age < 70 &
    workclass != "Not Known" &
    native_country == " United-States"
)

nrow(Income_data)
## [1] 32561
nrow(Income_data_sub)
## [1] 25790
#&& Column filters: condition 2 drops education (education_number is kept);
#&& conditions 5 and 6 drop capital_gain and capital_loss, which are 0 for
#&& most of the data

Income_data_sub <- Income_data_sub[
  ,
  !(names(Income_data_sub) %in%
    c("education", "capital_gain", "capital_loss"))
]


## Convert education_number to an ordered factor with levels 1 through 16
Income_data_sub$education_number <- ordered(
  Income_data_sub$education_number,
  levels = 1:16
)


str(Income_data_sub)
## 'data.frame':    25790 obs. of  12 variables:
##  $ age             : int  39 50 38 53 37 52 31 42 37 23 ...
##  $ workclass       : chr  " State-gov" " Self-emp-not-inc" " Private" " Private" ...
##  $ fnlwgt          : int  77516 83311 215646 234721 284582 209642 45781 159449 280464 122272 ...
##  $ education_number: Ord.factor w/ 16 levels "1"<"2"<"3"<"4"<..: 13 13 9 7 14 9 14 13 10 13 ...
##  $ marital_status  : chr  " Never-married" " Married-civ-spouse" " Divorced" " Married-civ-spouse" ...
##  $ occupation      : chr  " Adm-clerical" " Exec-managerial" " Handlers-cleaners" " Handlers-cleaners" ...
##  $ relationship    : chr  " Not-in-family" " Husband" " Not-in-family" " Husband" ...
##  $ race            : chr  " White" " White" " White" " Black" ...
##  $ sex             : chr  " Male" " Male" " Male" " Male" ...
##  $ hours_per_week  : int  40 13 40 40 40 45 50 40 80 30 ...
##  $ native_country  : chr  " United-States" " United-States" " United-States" " United-States" ...
##  $ class           : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 2 2 2 1 ...
## Summary of the final data set: quartiles and mean for the integer columns,
## level counts for the factor columns, length/class/mode for character columns
summary(Income_data_sub)
##       age         workclass             fnlwgt        education_number
##  Min.   :20.00   Length:25790       Min.   :  13769   9      :8721    
##  1st Qu.:29.00   Class :character   1st Qu.: 115795   10     :5862    
##  Median :38.00   Mode  :character   Median : 176796   13     :4569    
##  Mean   :38.93                      Mean   : 186866   14     :1459    
##  3rd Qu.:47.00                      3rd Qu.: 233511   11     :1220    
##  Max.   :69.00                      Max.   :1484705   12     : 938    
##                                                       (Other):3021    
##  marital_status      occupation        relationship      
##  Length:25790       Length:25790       Length:25790      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##      race               sex            hours_per_week  native_country    
##  Length:25790       Length:25790       Min.   : 1.00   Length:25790      
##  Class :character   Class :character   1st Qu.:40.00   Class :character  
##  Mode  :character   Mode  :character   Median :40.00   Mode  :character  
##                                        Mean   :41.91                     
##                                        3rd Qu.:45.00                     
##                                        Max.   :99.00                     
##                                                                          
##  class    
##  0:18880  
##  1: 6910  
##           
##           
##           
##           
## 

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.