TODO:
## [1] "dplyr" "langcog" "tidyr" "ggplot2" "lme4"
##
## Attaching package: 'langcog'
##
## The following object is masked from 'package:base':
##
## scale
##
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Loading required package: grid
## Loading required package: quadprog
Connect to database.
Let’s first look at the table structure.
activity_list <- tbl(kinedu, "activities")
babies <- tbl(kinedu, "babies")
activities <- tbl(kinedu, "user_activities")
users <- tbl(kinedu, "users")
Looks like we want to link babies
to user_activities
via activities
.
Let’s start by filtering out the babies that have fewer than say 5 activities.
baby_activity_quantity <- activities %>%
group_by(baby_id) %>%
filter(!is.na(baby_id)) %>%
summarize(n = n()) %>%
data.frame
qplot(n, data = filter(baby_activity_quantity)) +
scale_x_log10()
Now get baby info.
active_babies <- filter(baby_activity_quantity, n > 5)$baby_id
baby_info <- babies %>%
select(id, birthday, gender) %>%
collect %>%
mutate(baby_id = id,
dob = ymd(birthday)) %>%
select(-birthday, -id)
Join baby info into activities. (Note I’m ignoring update dates here).
activity <- activities %>%
filter(baby_id %in% active_babies) %>%
select(activity_id, baby_id, created_at) %>%
collect %>%
mutate(activity_time = ymd_hms(created_at))%>%
select(-created_at) %>%
left_join(baby_info)
activity <- activity %>%
mutate(age = duration(new_interval(dob, activity_time)) / dyears(1))
Now we can look at the histogram of ages at which activities were recorded!
qplot(age, data=activity) +
xlim(c(-1, 3))
Some prenatal activities in there that we should filter. Let’s also stop at three.
activity <- activity %>%
filter(age > 0 & age < 3)
Get activity info.
activity_info <- activity_list %>%
select(id, name, description) %>%
collect
activity_info$name <- sapply(activity_info$name,
function(x) {str_split(x, ",")[[1]][1]})
activity_info$name <- str_sub(sapply(activity_info$name,
function(x) {str_split(x, ":")[[1]][2]}),
start = 2, end = -2)
activity_info <- activity_info %>%
mutate(activity_id = id) %>%
select(-id)
Now merge infos.
d <- left_join(activity, select(activity_info, -description)) %>%
filter(!is.na(name))
Now we have a dataset of about 100k activities with babies who entered more than 5.
Get the top 50 most frequent activities.
activity_freq <- d %>%
group_by(name) %>%
summarise(n = n(),
age = mean(age)) %>%
arrange(desc(n))
qplot(n, data = activity_freq) +
scale_x_log10()
qplot(age, data = activity_freq)
Now get the age histogram for the 50 most frequent activities.
frequent_activities <- filter(activity_freq, n > 750)$name
activity_dates <- d %>%
filter(name %in% frequent_activities) %>%
group_by(name) %>%
summarise(age = mean(age)) %>%
arrange(age)
d.freq <- d %>%
filter(name %in% frequent_activities) %>%
mutate(name = factor(name, levels = activity_dates$name))
And plot.
ggplot(d.freq,
aes(x = age)) +
geom_histogram() +
facet_wrap(~name) +
xlim(c(0,1.5))
Get indicators data.
indicator_cats <- tbl(kinedu, "indicator_categories") %>%
mutate(indicator_category_id = id) %>%
collect %>%
data.frame
indicator_cats$name <- sapply(indicator_cats$name,
function(x) {str_split(x, ",")[[1]][1]})
indicator_cats$cat_name <- str_sub(sapply(indicator_cats$name,
function(x) {str_split(x, ":")[[1]][2]}),
start = 2, end = -2)
indicator_cats <- indicator_cats %>%
select(indicator_category_id, cat_name)
indicator_info <- tbl(kinedu, "indicators") %>%
mutate(indicator_id = id) %>%
select(indicator_id, description, indicator_category_id) %>%
collect %>%
data.frame %>%
left_join(indicator_cats)
indicators <- tbl(kinedu, "indicator_answers")
Explore indicator data - looks like this table may be populated with null answers for many indicators. Most have a tried_count
of 0.
tc <- indicators %>%
group_by(tried_count) %>%
summarise(n = n()) %>%
collect
qplot(tried_count, n, data = tc)
So tried_count
looks like it’s the number of times a particular indicator was tried (shown to user)? Essentially this is almost always 0.
indicators %>%
group_by(tried_count) %>%
summarise(n = n(),
answer = sum(answer == 1))
## Source: mysql 5.5.41-MariaDB-log [reader@db.kinedu.com:/kinedu_app]
## From: <derived table> [?? x 3]
##
## tried_count n answer
## (int) (dbl) (dbl)
## 1 0 20588543 12593620
## 2 1 33335 22542
## 3 2 4417 3128
## 4 3 1158 848
## 5 4 364 274
## 6 5 158 114
## 7 6 102 70
## 8 7 69 48
## 9 8 36 25
## 10 9 23 13
## .. ... ... ...
But this is kind of confusing, because we see that there are many answer==1
cases for tried_count==0
. Not sure what to make of that. For now let’s filter by tried_count
because I suspect that the others are autofilled - and in any case I can’t actually process 20M rows that easily. (It will take a bit of work, at least).
indicators <- indicators %>%
filter(!is.na(baby_id),
tried_count > 0) %>%
select(indicator_id, baby_id, updated_at,
tried_count, answer) %>%
data.frame %>%
mutate(indicator_time = ymd_hms(updated_at))%>%
select(-updated_at)
Now join in other info.
d <- indicators %>%
left_join(baby_info) %>%
left_join(indicator_info) %>%
mutate(age = duration(new_interval(dob, indicator_time)) / dyears(1))
Make short descriptions.
d$short_desc <- sapply(d$description,
function(x) {str_split(x, ",")[[1]][1]})
d$short_desc <- str_sub(sapply(d$short_desc,
function(x) {str_split(x, ":")[[1]][2]}),
start = 2, end = -2)
d$short_desc <- str_sub(d$short_desc, start = 1, end = 20)
Get the most frequent indicators.
indicator_freq <- d %>%
filter(!is.na(age)) %>%
group_by(short_desc) %>%
summarise(n = n()) %>%
arrange(desc(n))
qplot(n, data = indicator_freq) +
scale_x_log10()
This looks like a reasonable distribution, which is comforting. When I did this same thing with all answers (e.g. no tried_count
filter) the distribution was very odd.
Now what we want to do is to calculate the probability for each of the indicators.
frequent_indicators <- filter(indicator_freq, n > 200)$short_desc
indicator_summary <- d %>%
filter(short_desc %in% frequent_indicators) %>%
mutate(round_age = floor(age * 12)/12) %>%
filter(round_age >= 0 & round_age < 2) %>%
collect %>%
group_by(short_desc, cat_name, round_age) %>%
summarise(mean = mean(answer))
indicator_totals <- d %>%
filter(short_desc %in% frequent_indicators) %>%
collect %>%
group_by(short_desc, cat_name) %>%
summarise(prop = mean(answer==1)) %>%
arrange(desc(prop))
indicator_summary$short_desc_ord <- factor(indicator_summary$short_desc,
levels = indicator_totals$short_desc)
And plot. Note that categories are not mutually exclusive…
ggplot(indicator_summary,
aes(x = round_age, y = mean, col = short_desc)) +
geom_point() +
geom_smooth(span = 2, se=FALSE) +
facet_wrap(~cat_name) +
scale_colour_solarized(guide=FALSE) +
xlim(c(0,3)) +
geom_text(data = indicator_summary %>%
group_by(short_desc, cat_name) %>%
summarize(mean = min(mean),
round_age = max(round_age) + .1),
aes(label = short_desc), cex = 3.5,
hjust = 0)