library(magrittr)
library(readr)
## Warning: package 'readr' was built under R version 3.4.1
library(d3heatmap)
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(RColorBrewer)
pal = rev(brewer.pal(5, "PiYG"))

cx = read_csv("data/01.2011_Census_Microdata.csv")
## Parsed with column specification:
## cols(
##   `Person ID` = col_integer(),
##   Region = col_character(),
##   `Residence Type` = col_character(),
##   `Family Composition` = col_integer(),
##   `Population Base` = col_integer(),
##   Sex = col_integer(),
##   Age = col_integer(),
##   `Marital Status` = col_integer(),
##   Student = col_integer(),
##   `Country of Birth` = col_integer(),
##   Health = col_integer(),
##   `Ethnic Group` = col_integer(),
##   Religion = col_integer(),
##   `Economic Activity` = col_integer(),
##   Occupation = col_integer(),
##   Industry = col_integer(),
##   `Hours worked per week` = col_integer(),
##   `Approximated Social Grade` = col_integer()
## )
attr(cx, "spec") = NULL
names(cx) = c(
  "id","region","residence","family","population","sex","age",
  "marital","student","country","unhealth","ethnic","religion",
  "economic","occupation","industry","hours","grade")
# a quick check
cx$region %<>% factor
par(mar=c(6,5,5,2),cex=0.8)
table(cx$region) %>%  barplot(las=2, main="IDs by Regions")

# factoring ...
cx$residence %<>% factor
cx$family %<>% factor
cx$population %<>% factor
cx$sex %<>% factor
cx$marital %<>% factor
cx$student %<>% factor
cx$country %<>% factor
cx$ethnic %<>% factor
cx$religion %<>% factor
cx$economic %<>% factor
cx$occupation %<>% factor
cx$industry %<>% factor
cx$grade %<>% factor
summary(cx)
##        id                region       residence  family      population
##  Min.   :7394483   E12000008: 88084   C: 10654   -9: 18851   1:561040  
##  1st Qu.:7536918   E12000007: 83582   H:559087   1 : 96690   2:  6730  
##  Median :7679353   E12000002: 71436              2 :300962   3:  1971  
##  Mean   :7679353   E12000006: 59411              3 : 72641             
##  3rd Qu.:7821788   E12000005: 56875              4 :  9848             
##  Max.   :7964223   E12000009: 53774              5 : 64519             
##                    (Other)  :156579              6 :  6230             
##  sex             age        marital    student    country    
##  1:280569   Min.   :1.000   1:270999   1:126537   -9:  6804  
##  2:289172   1st Qu.:2.000   2:214180   2:443204   1 :485645  
##             Median :4.000   3: 11951              2 : 77292  
##             Mean   :3.979   4: 40713                         
##             3rd Qu.:6.000   5: 31898                         
##             Max.   :8.000                                    
##                                                              
##     unhealth      ethnic         religion         economic     
##  Min.   :-9.000   -9:  6804   2      :333481   1      :216025  
##  1st Qu.: 1.000   1 :483477   1      :141658   -9     :112618  
##  Median : 2.000   2 : 12209   9      : 40613   5      : 97480  
##  Mean   : 1.658   3 : 42712   6      : 27240   2      : 40632  
##  3rd Qu.: 2.000   4 : 18786   4      :  8214   6      : 24756  
##  Max.   : 5.000   5 :  5753   -9     :  6804   3      : 18109  
##                               (Other): 11731   (Other): 60121  
##    occupation        industry          hours        grade      
##  -9     :149984   -9     :149984   Min.   :-9.000   -9:124103  
##  2      : 64111   4      : 68878   1st Qu.:-9.000   1 : 82320  
##  9      : 58483   2      : 53433   Median :-9.000   2 :159642  
##  4      : 53254   8      : 49960   Mean   :-3.487   3 : 79936  
##  5      : 48546   11     : 49345   3rd Qu.: 3.000   4 :123740  
##  3      : 44937   10     : 40560   Max.   : 4.000              
##  (Other):150426   (Other):157581
# re-leveling for clearity ...
levels(cx$region) %<>% {gsub("00000","_",.)}
levels(cx$economic) %<>% {paste0("ECO",.)}  
levels(cx$industry) %<>% {paste0("IND",.)}  
levels(cx$occupation) %<>% {paste0("OCC",.)} 
# actually we can do cluster oncategorial variables, but
# it takes a very very long time,
# and the result is not very useful

# this would take about 3 hours
# library(klaR)
# install.packages("klaR")
# km = kmodes(cx[,-c(1,7,11,17)], 4) 

As a better approach, we can use the heatmap method to do clustering and do cross-analysis across categories. For examples, if we are interested in the health condition across occupations, industry and regions …

tapply(cx$unhealth, list(cx$occupation, cx$industry), mean)[-1,-1] %>% 
  d3heatmap(col=pal)
tapply(cx$unhealth, list(cx$occupation, cx$region), mean)[-1,] %>% 
  d3heatmap(col=pal)
tapply(cx$unhealth, list(cx$industry, cx$region), mean)[-1,] %>% 
  d3heatmap(col=pal)

What have you seen, in the figures above?


Maybe we also want to know how age is related to health …

table(cx$age) %>% barplot

tapply(cx$unhealth, cx$age, mean)%>% barplot

So, the older have poor health in general. But, is this relationship hold in every circumstances?

aggregate(unhealth ~ age + economic + industry, data=cx, mean) %>% 
  filter(industry != "IND-9") %>% 
  ggplot(aes(x=age, y=unhealth, fill=economic)) +
  geom_area(stat="identity", alpha=0.4) +
  facet_grid(economic ~ industry) +
  theme(legend.position="none")
## Warning: package 'bindrcpp' was built under R version 3.4.1


We can find interesting things by visualization. However, in order to win the competition, we need do more. It would be a big plue, if we could pull in the cancer data and try to do some risk analysis.