TODO:

Preliminary database loading

## [1] "dplyr"   "langcog" "tidyr"   "ggplot2" "lme4"
## 
## Attaching package: 'langcog'
## 
## The following object is masked from 'package:base':
## 
##     scale
## 
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Loading required package: grid
## Loading required package: quadprog

Connect to database.

Let’s first look at the table structure.

activity_list <- tbl(kinedu, "activities")
babies <- tbl(kinedu, "babies")
activities <- tbl(kinedu, "user_activities")
users <- tbl(kinedu, "users")

Looks like we want to link babies to user_activities via activities.

Activities

Extract database of activities and reporting dates

Let’s start by filtering out the babies that have fewer than say 5 activities.

baby_activity_quantity <- activities %>%
  group_by(baby_id) %>%
  filter(!is.na(baby_id)) %>%
  summarize(n = n()) %>%
  data.frame

qplot(n, data = filter(baby_activity_quantity)) + 
  scale_x_log10()

Now get baby info.

active_babies <- filter(baby_activity_quantity, n > 5)$baby_id

baby_info <- babies %>% 
  select(id, birthday, gender) %>%
  collect %>%
  mutate(baby_id = id, 
         dob = ymd(birthday)) %>%
  select(-birthday, -id)

Join baby info into activities. (Note I’m ignoring update dates here).

activity <- activities %>%
  filter(baby_id %in% active_babies) %>%
  select(activity_id, baby_id, created_at) %>%
  collect %>%
  mutate(activity_time = ymd_hms(created_at))%>%
  select(-created_at) %>%
  left_join(baby_info)

activity <- activity %>% 
  mutate(age = duration(new_interval(dob, activity_time)) / dyears(1))

Now we can look at the histogram of ages at which activities were recorded!

qplot(age, data=activity) + 
  xlim(c(-1, 3))

Some prenatal activities in there that we should filter. Let’s also stop at three.

activity <- activity %>%
  filter(age > 0 & age < 3)

Get activity info.

activity_info <- activity_list %>%
  select(id, name, description) %>%
  collect 

activity_info$name <- sapply(activity_info$name, 
                                function(x) {str_split(x, ",")[[1]][1]})
activity_info$name <- str_sub(sapply(activity_info$name, 
                                function(x) {str_split(x, ":")[[1]][2]}), 
                              start = 2, end = -2)

activity_info <- activity_info %>%
  mutate(activity_id = id) %>%
  select(-id)

Now merge infos.

d <- left_join(activity, select(activity_info, -description)) %>%
  filter(!is.na(name))

Now we have a dataset of about 100k activities with babies who entered more than 5.

Exploration of activities

Get the top 50 most frequent activities.

activity_freq <- d %>%
  group_by(name) %>%
  summarise(n = n(), 
            age = mean(age)) %>%
  arrange(desc(n))

qplot(n, data = activity_freq) + 
  scale_x_log10()

qplot(age, data = activity_freq) 

Now get the age histogram for the 50 most frequent activities.

frequent_activities <- filter(activity_freq, n > 750)$name

activity_dates <- d %>% 
  filter(name %in% frequent_activities) %>%
  group_by(name) %>%
  summarise(age = mean(age)) %>%
  arrange(age)

d.freq <- d %>% 
  filter(name %in% frequent_activities) %>%
  mutate(name = factor(name, levels = activity_dates$name))

And plot.

ggplot(d.freq, 
       aes(x = age)) + 
  geom_histogram() +
  facet_wrap(~name) +
  xlim(c(0,1.5))

Indicators

Extract assessments/indicators and join

Get indicators data.

indicator_cats <- tbl(kinedu, "indicator_categories") %>%
  mutate(indicator_category_id = id) %>%
  collect %>%
  data.frame

indicator_cats$name <- sapply(indicator_cats$name, 
                                function(x) {str_split(x, ",")[[1]][1]})
indicator_cats$cat_name <- str_sub(sapply(indicator_cats$name, 
                                function(x) {str_split(x, ":")[[1]][2]}), 
                              start = 2, end = -2)

indicator_cats <- indicator_cats %>% 
  select(indicator_category_id, cat_name)
  
indicator_info <- tbl(kinedu, "indicators") %>%
  mutate(indicator_id = id) %>%
  select(indicator_id, description, indicator_category_id) %>%
  collect %>% 
  data.frame %>%
  left_join(indicator_cats)
indicators <- tbl(kinedu, "indicator_answers") 

Explore indicator data - looks like this table may be populated with null answers for many indicators. Most have a tried_count of 0.

tc <- indicators %>% 
  group_by(tried_count) %>%
  summarise(n = n()) %>%
  collect

qplot(tried_count, n, data = tc)

So tried_count looks like it’s the number of times a particular indicator was tried (shown to user)? Essentially this is almost always 0.

indicators %>%
  group_by(tried_count) %>%
  summarise(n = n(), 
            answer = sum(answer == 1))
## Source: mysql 5.5.41-MariaDB-log [reader@db.kinedu.com:/kinedu_app]
## From: <derived table> [?? x 3]
## 
##    tried_count        n   answer
##          (int)    (dbl)    (dbl)
## 1            0 20588543 12593620
## 2            1    33335    22542
## 3            2     4417     3128
## 4            3     1158      848
## 5            4      364      274
## 6            5      158      114
## 7            6      102       70
## 8            7       69       48
## 9            8       36       25
## 10           9       23       13
## ..         ...      ...      ...

But this is kind of confusing, because we see that there are many answer==1 cases for tried_count==0. Not sure what to make of that. For now let’s filter by tried_count because I suspect that the others are autofilled - and in any case I can’t actually process 20M rows that easily. (It will take a bit of work, at least).

indicators <- indicators %>%
  filter(!is.na(baby_id), 
         tried_count > 0) %>%
  select(indicator_id, baby_id, updated_at,  
         tried_count, answer) %>%
  data.frame %>%
  mutate(indicator_time = ymd_hms(updated_at))%>%
  select(-updated_at)

Now join in other info.

d <- indicators %>%
  left_join(baby_info) %>%
  left_join(indicator_info) %>%
  mutate(age = duration(new_interval(dob, indicator_time)) / dyears(1))

Make short descriptions.

d$short_desc <- sapply(d$description, 
                                function(x) {str_split(x, ",")[[1]][1]})
d$short_desc <- str_sub(sapply(d$short_desc, 
                                function(x) {str_split(x, ":")[[1]][2]}), 
                              start = 2, end = -2)
d$short_desc <- str_sub(d$short_desc, start = 1, end = 20)

Exploration of indicators

Get the most frequent indicators.

indicator_freq <- d %>%
  filter(!is.na(age)) %>%
  group_by(short_desc) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

qplot(n, data = indicator_freq) + 
  scale_x_log10() 

This looks like a reasonable distribution, which is comforting. When I did this same thing with all answers (e.g. no tried_count filter) the distribution was very odd.

Now what we want to do is to calculate the probability for each of the indicators.

frequent_indicators <- filter(indicator_freq, n > 200)$short_desc

indicator_summary <- d %>% 
  filter(short_desc %in% frequent_indicators) %>%
  mutate(round_age = floor(age * 12)/12) %>%
  filter(round_age >= 0 & round_age < 2) %>%
  collect %>% 
  group_by(short_desc, cat_name, round_age) %>% 
  summarise(mean = mean(answer))

indicator_totals <- d %>% 
  filter(short_desc %in% frequent_indicators) %>%
  collect %>% 
  group_by(short_desc, cat_name) %>% 
  summarise(prop = mean(answer==1)) %>% 
  arrange(desc(prop))

indicator_summary$short_desc_ord <- factor(indicator_summary$short_desc, 
                                           levels = indicator_totals$short_desc)

And plot. Note that categories are not mutually exclusive…

ggplot(indicator_summary, 
       aes(x = round_age, y = mean, col = short_desc)) + 
  geom_point() +
  geom_smooth(span = 2, se=FALSE) + 
  facet_wrap(~cat_name) + 
  scale_colour_solarized(guide=FALSE) + 
  xlim(c(0,3)) + 
  geom_text(data = indicator_summary %>%
              group_by(short_desc, cat_name) %>% 
              summarize(mean = min(mean), 
                        round_age = max(round_age) + .1), 
            aes(label = short_desc), cex = 3.5,
            hjust = 0)