This page provides a simple analysis of census data. The dataset used on this page is sourced from https://archive.ics.uci.edu/ml/datasets/Adult.
Let’s read and examine the data.
# Download the UCI Adult data set (comma-separated) and inspect its structure.
file_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.table() keeps
# text columns as character by default, whereas the rest of this analysis
# (the str() output below and the is.factor() filter in the plotting loop)
# expects them to be factors.
income <- read.table(file = file_url, sep = ",", stringsAsFactors = TRUE)
str(income)
## 'data.frame': 32561 obs. of 15 variables:
## $ V1 : int 39 50 38 53 28 37 49 52 31 42 ...
## $ V2 : Factor w/ 9 levels " ?"," Federal-gov",..: 8 7 5 5 5 5 5 7 5 5 ...
## $ V3 : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ V4 : Factor w/ 16 levels " 10th"," 11th",..: 10 10 12 2 10 13 7 12 13 10 ...
## $ V5 : int 13 13 9 7 13 14 5 9 14 13 ...
## $ V6 : Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
## $ V7 : Factor w/ 15 levels " ?"," Adm-clerical",..: 2 5 7 7 11 5 9 5 11 5 ...
## $ V8 : Factor w/ 6 levels " Husband"," Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
## $ V9 : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
## $ V10: Factor w/ 2 levels " Female"," Male": 2 2 2 2 1 1 1 2 1 2 ...
## $ V11: int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ V12: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V13: int 40 13 40 40 40 40 16 45 50 40 ...
## $ V14: Factor w/ 42 levels " ?"," Cambodia",..: 40 40 40 40 6 40 24 40 40 40 ...
## $ V15: Factor w/ 2 levels " <=50K"," >50K": 1 1 1 1 1 1 1 2 2 2 ...
Name the variables
# Replace the default V1..V15 labels with descriptive column names.
names(income) <- c(
  "age", "workclass", "fnlwgt", "education", "education_num",
  "marital_status", "occupation", "relationship", "race", "sex",
  "capital_gain", "capital_loss", "hours_per_week", "native_country",
  "income_level"
)
Let’s tidy up the data
# Remove the column that is not needed for this analysis.
income[["fnlwgt"]] <- NULL
# Put the columns on the search path so they can be referenced by bare name.
# Note: attach() exposes a copy of the data frame as it is at this point;
# columns added to `income` afterwards are only reachable via `income$...`.
attach(income)
# Look for missing values that could cause problems. Unknown entries are
# encoded as a "?" level, which is fine: they are simply treated as one more
# category.
table(complete.cases(income))
##
## TRUE
## 32561
#There are no missing values
# Are capital gain and capital loss mutually exclusive? If they were not, a
# net gain/loss column would be useful. The columns are read from `income`
# explicitly so the check does not depend on the attach()ed copy.
any(income$capital_gain != 0 & income$capital_loss != 0)
## [1] FALSE
# Flag whether a person is involved in the capital market, i.e. has a
# non-zero capital gain or a non-zero capital loss. Both inputs are read from
# `income` directly rather than from the attach()ed copy.
income$capital_market <- income$capital_gain != 0 | income$capital_loss != 0
Let’s plot some graphs to better understand the data and draw some insights from it.
Let’s check education level
# Box plot of education_num for each income level, with outliers hidden.
# `data = income` resolves the formula variables from the data frame itself
# instead of the search path, and FALSE replaces the reassignable shortcut F.
boxplot(
  education_num ~ income_level,
  data = income,
  outline = FALSE,
  xlab = "Income Level",
  ylab = "Education Level",
  main = "Income Vs Education"
)
Let’s check whether participation in the capital market is related to income level.
# Column-wise proportions (margin = 2): within each capital_market group
# (FALSE / TRUE), the share of people at each income level. Both variables are
# taken from `income`; naming the first argument keeps the "income_level"
# header in the printed table.
prop.table(
  table(income_level = income$income_level, income$capital_market),
  margin = 2
)
##
## income_level FALSE TRUE
## <=50K 0.8097070 0.4209407
## >50K 0.1902930 0.5790593
# 57.9% of the people participating in the capital market earn more than 50K
Let’s plot all categorical variables…
library(ggplot2)
# Draw one dodged bar chart per categorical predictor, split by income level.
#
# Columns are chosen by type and name rather than by position. The previous
# range `1:ncol(income)-2` parses as `(1:ncol(income)) - 2`, i.e. -1, 0, 1, ...,
# and only worked because indexing with -1 or 0 returns a data frame, which
# fails the is.factor() test. Character columns are accepted as well so the
# loop still finds the categorical variables if the data was read without
# converting strings to factors.
is_categorical <- vapply(
  income,
  function(column) is.factor(column) || is.character(column),
  logical(1)
)
categorical_cols <- setdiff(names(income)[is_categorical], "income_level")

for (col_name in categorical_cols) {
  # .data[[col_name]] replaces the deprecated aes_string(); the x-axis label
  # is set explicitly so it still shows the column name.
  pl <- ggplot(income, aes(x = .data[[col_name]], fill = income_level)) +
    geom_bar(position = "dodge") +
    labs(x = col_name) +
    theme(axis.text.x = element_text(angle = 70))
  print(pl)
}