About

This page provides a simple analysis of census data. The dataset used on this page is sourced from the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/Adult.

Load Data

Let’s read and examine the data.

# Location of the raw Adult data file (comma separated, no header row, so
# read.table() assigns the default column names V1..V15).
file_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# stringsAsFactors = TRUE is passed explicitly: since R 4.0 the default is
# FALSE, which would load the text columns as character instead of factor.
# The str() output below and the is.factor() check in the plotting loop both
# expect factors.
income <- read.table(file = file_url, sep = ",", stringsAsFactors = TRUE)
# Inspect column types and the first few values of each column
str(income)
## 'data.frame':    32561 obs. of  15 variables:
##  $ V1 : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ V2 : Factor w/ 9 levels " ?"," Federal-gov",..: 8 7 5 5 5 5 5 7 5 5 ...
##  $ V3 : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
##  $ V4 : Factor w/ 16 levels " 10th"," 11th",..: 10 10 12 2 10 13 7 12 13 10 ...
##  $ V5 : int  13 13 9 7 13 14 5 9 14 13 ...
##  $ V6 : Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
##  $ V7 : Factor w/ 15 levels " ?"," Adm-clerical",..: 2 5 7 7 11 5 9 5 11 5 ...
##  $ V8 : Factor w/ 6 levels " Husband"," Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
##  $ V9 : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
##  $ V10: Factor w/ 2 levels " Female"," Male": 2 2 2 2 1 1 1 2 1 2 ...
##  $ V11: int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ V12: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ V13: int  40 13 40 40 40 40 16 45 50 40 ...
##  $ V14: Factor w/ 42 levels " ?"," Cambodia",..: 40 40 40 40 6 40 24 40 40 40 ...
##  $ V15: Factor w/ 2 levels " <=50K"," >50K": 1 1 1 1 1 1 1 2 2 2 ...

Name the variables

# Replace the default V1..V15 names with descriptive column names, in file order
names(income) <- c(
  "age", "workclass", "fnlwgt", "education", "education_num",
  "marital_status", "occupation", "relationship", "race", "sex",
  "capital_gain", "capital_loss", "hours_per_week", "native_country",
  "income_level"
)

Tidy Up and Prepare Data

Let’s tidy up the data.

# Remove the column that this analysis does not need
income[["fnlwgt"]] <- NULL
# Attach the data frame so that its columns can be referred to by bare name.
# Later chunks in this page rely on this.
attach(income)
# Count rows with and without missing values. Unknown entries are encoded as
# the factor level "?" rather than NA, so they simply act as one more category.
table(complete.cases(income))
## 
##  TRUE 
## 32561
# Every row is complete: there are no NA values
# Are capital gain and capital loss mutually exclusive? If some rows had both,
# a net gain/loss column would be worth computing.
any((capital_gain != 0) & (capital_loss != 0))
## [1] FALSE
# Flag people involved in the capital market: TRUE unless both the capital
# gain and the capital loss are zero
income$capital_market <- !(capital_gain == 0 & capital_loss == 0)

Visualization

Let’s plot some graphs to better understand the data and draw some insights from it.

Let’s check how education level relates to income level.

# Distribution of education_num within each income level. Outliers are hidden
# (outline = FALSE). The columns are looked up in `income` via `data =` so the
# call does not depend on the data frame having been attached, and FALSE is
# spelled out because `F` is an ordinary variable that can be reassigned.
boxplot(
  education_num ~ income_level,
  data = income,
  outline = FALSE,
  xlab = "Income Level",
  ylab = "Education Level",
  main = "Income Vs Education"
)

Let’s check whether participation in the capital market is related to income level.

# Cross-tabulate income level (rows) against capital-market participation
# (columns) and convert the counts to proportions within each column
prop.table(
  table(income_level = income$income_level, income$capital_market),
  margin = 2
)
##             
## income_level     FALSE      TRUE
##        <=50K 0.8097070 0.4209407
##        >50K  0.1902930 0.5790593
# About 57.9% of the people participating in the capital market earn more than 50K

Let’s plot all categorical variables against income level.

library(ggplot2)
# Draw one dodged bar chart per categorical column, filled by income level.
# The last two columns of `income` are skipped: at this point in the page they
# are income_level (the fill variable) and the derived capital_market flag.
#
# `1:ncol(income)-2` parses as `(1:ncol(income)) - 2`, i.e. -1, 0, 1, ..., n-2,
# because `:` binds tighter than binary `-`. It only worked because the -1 and
# 0 iterations yield data frames that fail the is.factor() test. seq_len()
# produces exactly 1..(n-2) and is empty when there are fewer than 3 columns.
for (i in seq_len(max(ncol(income) - 2, 0))) {
  # [[i]] extracts the column itself as a vector
  if (is.factor(income[[i]])) {
    pl <- ggplot(income, aes_string(colnames(income)[i], fill = "income_level")) +
      geom_bar(position = "dodge") +
      theme(axis.text.x = element_text(angle = 70))
    # Plots built inside a loop are not auto-printed, so print explicitly
    print(pl)
  }
}