library(magrittr)
library(readr)
## Warning: package 'readr' was built under R version 3.4.1
library(d3heatmap)
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(RColorBrewer)
pal = rev(brewer.pal(5, "PiYG"))
cx = read_csv("data/01.2011_Census_Microdata.csv")
## Parsed with column specification:
## cols(
## `Person ID` = col_integer(),
## Region = col_character(),
## `Residence Type` = col_character(),
## `Family Composition` = col_integer(),
## `Population Base` = col_integer(),
## Sex = col_integer(),
## Age = col_integer(),
## `Marital Status` = col_integer(),
## Student = col_integer(),
## `Country of Birth` = col_integer(),
## Health = col_integer(),
## `Ethnic Group` = col_integer(),
## Religion = col_integer(),
## `Economic Activity` = col_integer(),
## Occupation = col_integer(),
## Industry = col_integer(),
## `Hours worked per week` = col_integer(),
## `Approximated Social Grade` = col_integer()
## )
attr(cx, "spec") = NULL
names(cx) = c(
"id","region","residence","family","population","sex","age",
"marital","student","country","unhealth","ethnic","religion",
"economic","occupation","industry","hours","grade")
# a quick check
cx$region %<>% factor
par(mar=c(6,5,5,2),cex=0.8)
table(cx$region) %>% barplot(las=2, main="IDs by Regions")
# factoring ...
cx$residence %<>% factor
cx$family %<>% factor
cx$population %<>% factor
cx$sex %<>% factor
cx$marital %<>% factor
cx$student %<>% factor
cx$country %<>% factor
cx$ethnic %<>% factor
cx$religion %<>% factor
cx$economic %<>% factor
cx$occupation %<>% factor
cx$industry %<>% factor
cx$grade %<>% factor
summary(cx)
## id region residence family population
## Min. :7394483 E12000008: 88084 C: 10654 -9: 18851 1:561040
## 1st Qu.:7536918 E12000007: 83582 H:559087 1 : 96690 2: 6730
## Median :7679353 E12000002: 71436 2 :300962 3: 1971
## Mean :7679353 E12000006: 59411 3 : 72641
## 3rd Qu.:7821788 E12000005: 56875 4 : 9848
## Max. :7964223 E12000009: 53774 5 : 64519
## (Other) :156579 6 : 6230
## sex age marital student country
## 1:280569 Min. :1.000 1:270999 1:126537 -9: 6804
## 2:289172 1st Qu.:2.000 2:214180 2:443204 1 :485645
## Median :4.000 3: 11951 2 : 77292
## Mean :3.979 4: 40713
## 3rd Qu.:6.000 5: 31898
## Max. :8.000
##
## unhealth ethnic religion economic
## Min. :-9.000 -9: 6804 2 :333481 1 :216025
## 1st Qu.: 1.000 1 :483477 1 :141658 -9 :112618
## Median : 2.000 2 : 12209 9 : 40613 5 : 97480
## Mean : 1.658 3 : 42712 6 : 27240 2 : 40632
## 3rd Qu.: 2.000 4 : 18786 4 : 8214 6 : 24756
## Max. : 5.000 5 : 5753 -9 : 6804 3 : 18109
## (Other): 11731 (Other): 60121
## occupation industry hours grade
## -9 :149984 -9 :149984 Min. :-9.000 -9:124103
## 2 : 64111 4 : 68878 1st Qu.:-9.000 1 : 82320
## 9 : 58483 2 : 53433 Median :-9.000 2 :159642
## 4 : 53254 8 : 49960 Mean :-3.487 3 : 79936
## 5 : 48546 11 : 49345 3rd Qu.: 3.000 4 :123740
## 3 : 44937 10 : 40560 Max. : 4.000
## (Other):150426 (Other):157581
# re-leveling for clearity ...
levels(cx$region) %<>% {gsub("00000","_",.)}
levels(cx$economic) %<>% {paste0("ECO",.)}
levels(cx$industry) %<>% {paste0("IND",.)}
levels(cx$occupation) %<>% {paste0("OCC",.)}
# actually we can do cluster oncategorial variables, but
# it takes a very very long time,
# and the result is not very useful
# this would take about 3 hours
# library(klaR)
# install.packages("klaR")
# km = kmodes(cx[,-c(1,7,11,17)], 4)
As a better approach, we can use the heatmap method to do clustering and do cross-analysis across categories. For examples, if we are interested in the health condition across occupations, industry and regions …
tapply(cx$unhealth, list(cx$occupation, cx$industry), mean)[-1,-1] %>%
d3heatmap(col=pal)
tapply(cx$unhealth, list(cx$occupation, cx$region), mean)[-1,] %>%
d3heatmap(col=pal)
tapply(cx$unhealth, list(cx$industry, cx$region), mean)[-1,] %>%
d3heatmap(col=pal)
What have you seen, in the figures above?
Maybe we also want to know how age is related to health …
table(cx$age) %>% barplot
tapply(cx$unhealth, cx$age, mean)%>% barplot
So, the older have poor health in general. But, is this relationship hold in every circumstances?
aggregate(unhealth ~ age + economic + industry, data=cx, mean) %>%
filter(industry != "IND-9") %>%
ggplot(aes(x=age, y=unhealth, fill=economic)) +
geom_area(stat="identity", alpha=0.4) +
facet_grid(economic ~ industry) +
theme(legend.position="none")
## Warning: package 'bindrcpp' was built under R version 3.4.1
We can find interesting things by visualization. However, in order to win the competition, we need do more. It would be a big plue, if we could pull in the cancer data and try to do some risk analysis.