.libPaths(c("/home/rstudioshared", "/home/rstudioshared/packages",
            "/home/rstudioshared/shared_files/packages"))

profiles <- read.csv('/home/rstudioshared/shared_files/data/profiles_removed_ages.csv')
library(dplyr); library(ggplot2)

The Data

This .csv file has data from nearly 60,000 Ok Cupid profiles. I removed the essays (since that made the file enormous) so we are left with demographic data and short answers.

The Challenge

Half of these profiles (the test set) are missing ages. Your challenge is to predict their ages (or claimed ages) as accurately as possible based on the other answers in the profile. You will then submit your age estimates to me and I will act as Kaggle, awarding a class winner based on the lowest root mean square error (RMSE).

A Summary of the Data

summary(profiles)
##        X            set              ID             age        
##  Min.   :    1   test :29973   Min.   :    1   Min.   :  0.00  
##  1st Qu.:14987   train:29973   1st Qu.:14987   1st Qu.:  0.00  
##  Median :29974                 Median :29974   Median :  9.00  
##  Mean   :29974                 Mean   :29974   Mean   : 16.18  
##  3rd Qu.:44960                 3rd Qu.:44960   3rd Qu.: 30.00  
##  Max.   :59946                 Max.   :59946   Max.   :110.00  
##                                                                
##     body_type                    diet               drinks     
##  average :14652   mostly anything  :16585   desperately:  322  
##  fit     :12711   anything         : 6183   not at all : 3267  
##  athletic:11819   strictly anything: 5113   often      : 5164  
##  thin    : 4711   mostly vegetarian: 3444   rarely     : 5957  
##  curvy   : 3924   mostly other     : 1007   socially   :41780  
##  (Other) : 6833   (Other)          : 3219   very often :  471  
##  NA's    : 5296   NA's             :24395   NA's       : 2985  
##        drugs                                   education    
##  never    :37724   graduated from college/university:23959  
##  often    :  410   graduated from masters program   : 8961  
##  sometimes: 7732   working on college/university    : 5712  
##  NA's     :14080   working on masters program       : 1683  
##                    graduated from two-year college  : 1531  
##                    (Other)                          :11472  
##                    NA's                             : 6628  
##             ethnicity         height         income       
##  white           :32828   Min.   : 1.0   Min.   :  20000  
##  asian           : 6134   1st Qu.:66.0   1st Qu.:  20000  
##  hispanic / latin: 2823   Median :68.0   Median :  50000  
##  black           : 2008   Mean   :68.3   Mean   : 104401  
##  other           : 1706   3rd Qu.:71.0   3rd Qu.: 100000  
##  (Other)         : 8764   Max.   :95.0   Max.   :1000000  
##  NA's            : 5683   NA's   :6      NA's   :48443    
##                              job            last_online   
##                                : 8201   2012-06-30:13753  
##  other                         : 7588   2012-06-29: 8440  
##  student                       : 4882   2012-06-28: 3981  
##  science / tech / engineering  : 4848   2012-06-27: 2743  
##  computer / hardware / software: 4709   2012-07-01: 2090  
##  artistic / musical / writer   : 4438   (Other)   :28936  
##  (Other)                       :25280   NA's      :    3  
##                       location    
##  san francisco, california:31064  
##  oakland, california      : 7214  
##  berkeley, california     : 4210  
##  san mateo, california    : 1331  
##  palo alto, california    : 1064  
##  alameda, california      :  910  
##  (Other)                  :14153  
##                                   offspring       orientation   
##  doesn't have kids                     : 7559   bisexual: 2767  
##  doesn't have kids, but might want them: 3875   gay     : 5573  
##  doesn't have kids, but wants them     : 3565   straight:51603  
##  doesn't want kids                     : 2926   NA's    :    3  
##  has kids                              : 1883                   
##  (Other)                               : 4575                   
##  NA's                                  :35563                   
##                         pets      
##  likes dogs and likes cats:14813  
##  likes dogs               : 7224  
##  likes dogs and has cats  : 4313  
##  has dogs                 : 4133  
##  has dogs and likes cats  : 2333  
##  (Other)                  : 7207  
##  NA's                     :19923  
##                                      religion       sex       
##  agnosticism                             : 2723   f   :24117  
##  other                                   : 2691   m   :35826  
##  agnosticism but not too serious about it: 2636   NA's:    3  
##  agnosticism and laughing about it       : 2496               
##  catholicism but not too serious about it: 2318               
##  (Other)                                 :26853               
##  NA's                                    :20229               
##                                   sign                  smokes     
##  gemini and it's fun to think about : 1782   no            :43893  
##  scorpio and it's fun to think about: 1772   sometimes     : 3787  
##  leo and it's fun to think about    : 1692   trying to quit: 1480  
##  libra and it's fun to think about  : 1648   when drinking : 3040  
##  taurus and it's fun to think about : 1640   yes           : 2231  
##  (Other)                            :40353   NA's          : 5515  
##  NA's                               :11059                         
##                                     speaks                 status     
##  english                               :21828   available     : 1864  
##  english (fluently)                    : 6627   married       :  310  
##  english (fluently), spanish (poorly)  : 2059   seeing someone: 2064  
##  english (fluently), spanish (okay)    : 1917   single        :55695  
##  english (fluently), spanish (fluently): 1288   unknown       :   10  
##  english, spanish                      :  859   NA's          :    3  
##  (Other)                               :25368

Like most real data, this data set is a bit messy (in fact, relatively speaking this is quite neat). There are lots missing data points - folks who didn’t enter their income for instance, and many of the variables are not coded in the ideal way for a regression model.

Splitting into training and test sets

train <- filter(profiles, set=="train")
test <- filter(profiles, set=="test")

A Not-quite-so-simple Income model

m <- lm(age~income, data=train)
summary(m)
## 
## Call:
## lm(formula = age ~ income, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.658  -7.458  -2.652   5.342  36.354 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.266e+01  1.473e-01 221.778   <2e-16 ***
## income      -2.045e-07  6.552e-07  -0.312    0.755    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.89 on 5728 degrees of freedom
##   (24243 observations deleted due to missingness)
## Multiple R-squared:  1.7e-05,    Adjusted R-squared:  -0.0001576 
## F-statistic: 0.09739 on 1 and 5728 DF,  p-value: 0.755

At first blush, income does not appear to be a significant predictor of age but that’s simply because we are utilizing it in a poor way. Instead, let’s try to predict age using the log of income (I’ll use log base 10 for easier interpreation):

m <- lm(age~log(income, base=10), data=train)
summary(m)
## 
## Call:
## lm(formula = age ~ log(income, base = 10), data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.343  -6.496  -2.496   4.592  38.713 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              7.3620     1.5122   4.868 1.16e-06 ***
## log(income, base = 10)   5.3301     0.3177  16.776  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.656 on 5728 degrees of freedom
##   (24243 observations deleted due to missingness)
## Multiple R-squared:  0.04683,    Adjusted R-squared:  0.04667 
## F-statistic: 281.4 on 1 and 5728 DF,  p-value: < 2.2e-16

Now income is easily statistically significant. What age does this predict for someone who reports $10,000 of income? What about for someone with $1,000,000 of income? What about for someone with $1 of income or $0 of income? Do these values seem reasonable?

Note that, using the log of income is far from our only choice. We could get creative:

m <- lm(age~I(1/(income+10000)), data=train)
summary(m)
## 
## Call:
## lm(formula = age ~ I(1/(income + 10000)), data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.072  -6.199  -2.554   4.041  40.982 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            3.838e+01  2.519e-01  152.38   <2e-16 ***
## I(1/(income + 10000)) -3.108e+05  1.189e+04  -26.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.348 on 5728 degrees of freedom
##   (24243 observations deleted due to missingness)
## Multiple R-squared:  0.1066, Adjusted R-squared:  0.1064 
## F-statistic: 683.5 on 1 and 5728 DF,  p-value: < 2.2e-16

Which of these two models has lower error in sample?

One problem with either model is that most of its predictions with be NA because most OkCupid users (roughly 48 thousand of the total 60 thousand users) did not income their incomes. Here are the first 40 predictions:

predict(m, train)[1:40]
##        1        2        3        4        5        6        7        8 
##       NA       NA       NA       NA       NA       NA       NA 33.19900 
##        9       10       11       12       13       14       15       16 
##       NA       NA       NA       NA       NA       NA       NA       NA 
##       17       18       19       20       21       22       23       24 
##       NA       NA       NA       NA       NA       NA       NA       NA 
##       25       26       27       28       29       30       31       32 
## 38.07191       NA       NA       NA       NA       NA       NA       NA 
##       33       34       35       36       37       38       39       40 
##       NA 28.01834       NA       NA 36.43692       NA       NA       NA

How do we work around this?

train$listsincome[!is.na(train$income)] <- 1
train$listsincome[is.na(train$income)] <- 0

train$logincome <- ifelse(train$listsincome, log(train$income, base=10),0)

m <- lm(age~logincome+listsincome, data=train)
summary(m)
## 
## Call:
## lm(formula = age ~ logincome + listsincome, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.343  -6.291  -2.291   4.049  77.709 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  32.29101    0.06073  531.75   <2e-16 ***
## logincome     5.33014    0.31112   17.13   <2e-16 ***
## listsincome -24.92898    1.48206  -16.82   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.455 on 29970 degrees of freedom
## Multiple R-squared:  0.009906,   Adjusted R-squared:  0.00984 
## F-statistic: 149.9 on 2 and 29970 DF,  p-value: < 2.2e-16
hist(predict(m, train))

hist(train$age)

Adding to the Model

levels(train$orientation)
## [1] "bisexual" "gay"      "straight"
m <- lm(age~logincome+listsincome+orientation, data=train)
summary(m)
## 
## Call:
## lm(formula = age ~ logincome + listsincome + orientation, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.314  -6.416  -2.416   4.563  77.584 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          29.4707     0.2566 114.872   <2e-16 ***
## logincome             5.1727     0.3109  16.640   <2e-16 ***
## listsincome         -24.1376     1.4808 -16.300   <2e-16 ***
## orientationgay        2.9571     0.3112   9.502   <2e-16 ***
## orientationstraight   2.9449     0.2612  11.274   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.434 on 29966 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.01412,    Adjusted R-squared:  0.01399 
## F-statistic: 107.3 on 4 and 29966 DF,  p-value: < 2.2e-16

Recoding Variables

Are Zodiac Signs “Fun”?

levels(profiles$sign)
##  [1] "aquarius"                               
##  [2] "aquarius and it matters a lot"          
##  [3] "aquarius and it's fun to think about"   
##  [4] "aquarius but it doesn't matter"         
##  [5] "aries"                                  
##  [6] "aries and it matters a lot"             
##  [7] "aries and it's fun to think about"      
##  [8] "aries but it doesn't matter"            
##  [9] "cancer"                                 
## [10] "cancer and it matters a lot"            
## [11] "cancer and it's fun to think about"     
## [12] "cancer but it doesn't matter"           
## [13] "capricorn"                              
## [14] "capricorn and it matters a lot"         
## [15] "capricorn and it's fun to think about"  
## [16] "capricorn but it doesn't matter"        
## [17] "gemini"                                 
## [18] "gemini and it matters a lot"            
## [19] "gemini and it's fun to think about"     
## [20] "gemini but it doesn't matter"           
## [21] "leo"                                    
## [22] "leo and it matters a lot"               
## [23] "leo and it's fun to think about"        
## [24] "leo but it doesn't matter"              
## [25] "libra"                                  
## [26] "libra and it matters a lot"             
## [27] "libra and it's fun to think about"      
## [28] "libra but it doesn't matter"            
## [29] "pisces"                                 
## [30] "pisces and it matters a lot"            
## [31] "pisces and it's fun to think about"     
## [32] "pisces but it doesn't matter"           
## [33] "sagittarius"                            
## [34] "sagittarius and it matters a lot"       
## [35] "sagittarius and it's fun to think about"
## [36] "sagittarius but it doesn't matter"      
## [37] "scorpio"                                
## [38] "scorpio and it matters a lot"           
## [39] "scorpio and it's fun to think about"    
## [40] "scorpio but it doesn't matter"          
## [41] "taurus"                                 
## [42] "taurus and it matters a lot"            
## [43] "taurus and it's fun to think about"     
## [44] "taurus but it doesn't matter"           
## [45] "virgo"                                  
## [46] "virgo and it matters a lot"             
## [47] "virgo and it's fun to think about"      
## [48] "virgo but it doesn't matter"
train$sign[1:20]
##  [1] gemini                             pisces but it doesn't matter      
##  [3] aquarius                           virgo                             
##  [5] gemini but it doesn't matter       taurus                            
##  [7] taurus                             taurus                            
##  [9] pisces but it doesn't matter       libra but it doesn't matter       
## [11] libra                              sagittarius but it doesn't matter 
## [13] scorpio and it matters a lot       leo and it's fun to think about   
## [15] <NA>                               gemini                            
## [17] libra                              sagittarius but it doesn't matter 
## [19] cancer and it's fun to think about libra                             
## 48 Levels: aquarius ... virgo but it doesn't matter
grepl("fun",train$sign)[1:20]
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE
train$sign_fun <- 0
train[grepl("fun",train$sign),"sign_fun"] <- 1
train[grepl("doesn't matter",train$sign),"sign_fun"] <- -1

m <- lm(age~logincome+listsincome+orientation+sign_fun, data=train)
summary(m)
## 
## Call:
## lm(formula = age ~ logincome + listsincome + orientation + sign_fun, 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.617  -6.554  -2.409   4.319  77.591 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          29.4221     0.2568 114.571  < 2e-16 ***
## logincome             5.1985     0.3109  16.723  < 2e-16 ***
## listsincome         -24.2551     1.4808 -16.380  < 2e-16 ***
## orientationgay        2.9606     0.3111   9.515  < 2e-16 ***
## orientationstraight   2.9872     0.2614  11.429  < 2e-16 ***
## sign_fun              0.2720     0.0706   3.853 0.000117 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.432 on 29965 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.01461,    Adjusted R-squared:  0.01444 
## F-statistic: 88.83 on 5 and 29965 DF,  p-value: < 2.2e-16

A Little Cleanup?

train$height[train$height < 50] <- NA

train[grepl("student",train$job),"isstudent"] <- 1
train[!grepl("student",train$job),"isstudent"] <- 0

train[grepl("strictly",train$diet),"diet_strict"] <- 1
train[!grepl("strictly",train$diet),"diet_strict"] <- 0

What if we want to make use of eduction levels?

We may need to convert education to a numeric scale

levels(train$education)
##  [1] "college/university"               
##  [2] "dropped out of college/university"
##  [3] "dropped out of high school"       
##  [4] "dropped out of law school"        
##  [5] "dropped out of masters program"   
##  [6] "dropped out of med school"        
##  [7] "dropped out of ph.d program"      
##  [8] "dropped out of space camp"        
##  [9] "dropped out of two-year college"  
## [10] "graduated from college/university"
## [11] "graduated from high school"       
## [12] "graduated from law school"        
## [13] "graduated from masters program"   
## [14] "graduated from med school"        
## [15] "graduated from ph.d program"      
## [16] "graduated from space camp"        
## [17] "graduated from two-year college"  
## [18] "high school"                      
## [19] "law school"                       
## [20] "masters program"                  
## [21] "med school"                       
## [22] "ph.d program"                     
## [23] "space camp"                       
## [24] "two-year college"                 
## [25] "working on college/university"    
## [26] "working on high school"           
## [27] "working on law school"            
## [28] "working on masters program"       
## [29] "working on med school"            
## [30] "working on ph.d program"          
## [31] "working on space camp"            
## [32] "working on two-year college"
train$ed.level <- 14
train[grepl("high school",train$education),"ed.level"] <- 12
train[grepl("two-year",train$education),"ed.level"] <- 14
train[grepl("college",train$education),"ed.level"] <- 16
train[grepl("masters",train$education),"ed.level"] <- 18
train[grepl("law school",train$education),"ed.level"] <- 19
train[grepl("med",train$education),"ed.level"] <- 20
train[grepl("ph.d",train$education),"ed.level"] <- 21
train[grepl("dropped",train$education),"ed.level"] <- 
  train[grepl("dropped",train$education),"ed.level"] -2 


summary(train$ed.level)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.00   16.00   16.00   16.22   18.00   21.00
hist(train$ed.level)

You may find that you want to recode other variables in similar way.