.libPaths(c("/home/rstudioshared", "/home/rstudioshared/packages",
"/home/rstudioshared/shared_files/packages"))
profiles <- read.csv('/home/rstudioshared/shared_files/data/profiles_removed_ages.csv')
library(dplyr); library(ggplot2)
This .csv file has data from nearly 60,000 OkCupid profiles. I removed the essays (since they made the file enormous), so we are left with demographic data and short answers.
Half of these profiles (the test set) are missing ages. Your challenge is to predict their ages (or claimed ages) as accurately as possible based on the other answers in the profile. You will then submit your age estimates to me and I will act as Kaggle, awarding a class winner based on the lowest root mean square error (RMSE).
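For reference, RMSE is just the square root of the mean squared difference between the predicted and actual ages. A minimal sketch of that calculation (the function name is mine, not part of the grading code):
# root mean square error between predicted and actual ages
rmse <- function(predicted, actual) {
  sqrt(mean((predicted - actual)^2, na.rm = TRUE))
}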
summary(profiles)
## X set ID age
## Min. : 1 test :29973 Min. : 1 Min. : 0.00
## 1st Qu.:14987 train:29973 1st Qu.:14987 1st Qu.: 0.00
## Median :29974 Median :29974 Median : 9.00
## Mean :29974 Mean :29974 Mean : 16.18
## 3rd Qu.:44960 3rd Qu.:44960 3rd Qu.: 30.00
## Max. :59946 Max. :59946 Max. :110.00
##
## body_type diet drinks
## average :14652 mostly anything :16585 desperately: 322
## fit :12711 anything : 6183 not at all : 3267
## athletic:11819 strictly anything: 5113 often : 5164
## thin : 4711 mostly vegetarian: 3444 rarely : 5957
## curvy : 3924 mostly other : 1007 socially :41780
## (Other) : 6833 (Other) : 3219 very often : 471
## NA's : 5296 NA's :24395 NA's : 2985
## drugs education
## never :37724 graduated from college/university:23959
## often : 410 graduated from masters program : 8961
## sometimes: 7732 working on college/university : 5712
## NA's :14080 working on masters program : 1683
## graduated from two-year college : 1531
## (Other) :11472
## NA's : 6628
## ethnicity height income
## white :32828 Min. : 1.0 Min. : 20000
## asian : 6134 1st Qu.:66.0 1st Qu.: 20000
## hispanic / latin: 2823 Median :68.0 Median : 50000
## black : 2008 Mean :68.3 Mean : 104401
## other : 1706 3rd Qu.:71.0 3rd Qu.: 100000
## (Other) : 8764 Max. :95.0 Max. :1000000
## NA's : 5683 NA's :6 NA's :48443
## job last_online
## : 8201 2012-06-30:13753
## other : 7588 2012-06-29: 8440
## student : 4882 2012-06-28: 3981
## science / tech / engineering : 4848 2012-06-27: 2743
## computer / hardware / software: 4709 2012-07-01: 2090
## artistic / musical / writer : 4438 (Other) :28936
## (Other) :25280 NA's : 3
## location
## san francisco, california:31064
## oakland, california : 7214
## berkeley, california : 4210
## san mateo, california : 1331
## palo alto, california : 1064
## alameda, california : 910
## (Other) :14153
## offspring orientation
## doesn't have kids : 7559 bisexual: 2767
## doesn't have kids, but might want them: 3875 gay : 5573
## doesn't have kids, but wants them : 3565 straight:51603
## doesn't want kids : 2926 NA's : 3
## has kids : 1883
## (Other) : 4575
## NA's :35563
## pets
## likes dogs and likes cats:14813
## likes dogs : 7224
## likes dogs and has cats : 4313
## has dogs : 4133
## has dogs and likes cats : 2333
## (Other) : 7207
## NA's :19923
## religion sex
## agnosticism : 2723 f :24117
## other : 2691 m :35826
## agnosticism but not too serious about it: 2636 NA's: 3
## agnosticism and laughing about it : 2496
## catholicism but not too serious about it: 2318
## (Other) :26853
## NA's :20229
## sign smokes
## gemini and it's fun to think about : 1782 no :43893
## scorpio and it's fun to think about: 1772 sometimes : 3787
## leo and it's fun to think about : 1692 trying to quit: 1480
## libra and it's fun to think about : 1648 when drinking : 3040
## taurus and it's fun to think about : 1640 yes : 2231
## (Other) :40353 NA's : 5515
## NA's :11059
## speaks status
## english :21828 available : 1864
## english (fluently) : 6627 married : 310
## english (fluently), spanish (poorly) : 2059 seeing someone: 2064
## english (fluently), spanish (okay) : 1917 single :55695
## english (fluently), spanish (fluently): 1288 unknown : 10
## english, spanish : 859 NA's : 3
## (Other) :25368
Like most real data, this data set is a bit messy (in fact, relatively speaking it is quite neat). There are lots of missing data points (folks who didn't enter their income, for instance), and many of the variables are not coded in the ideal way for a regression model.
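A quick way to count the NA values in each column (just a convenience; the summary above already reports them). Note that some columns, such as job, mark a missing answer with a blank string rather than NA, so this undercounts for those:
sapply(profiles, function(x) sum(is.na(x)))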
train <- filter(profiles, set=="train")
test <- filter(profiles, set=="test")
m <- lm(age~income, data=train)
summary(m)
##
## Call:
## lm(formula = age ~ income, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.658 -7.458 -2.652 5.342 36.354
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.266e+01 1.473e-01 221.778 <2e-16 ***
## income -2.045e-07 6.552e-07 -0.312 0.755
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.89 on 5728 degrees of freedom
## (24243 observations deleted due to missingness)
## Multiple R-squared: 1.7e-05, Adjusted R-squared: -0.0001576
## F-statistic: 0.09739 on 1 and 5728 DF, p-value: 0.755
At first blush, income does not appear to be a significant predictor of age, but that's simply because we are using it poorly. Instead, let's try to predict age using the log of income (I'll use log base 10 for easier interpretation):
m <- lm(age~log(income, base=10), data=train)
summary(m)
##
## Call:
## lm(formula = age ~ log(income, base = 10), data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.343 -6.496 -2.496 4.592 38.713
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.3620 1.5122 4.868 1.16e-06 ***
## log(income, base = 10) 5.3301 0.3177 16.776 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.656 on 5728 degrees of freedom
## (24243 observations deleted due to missingness)
## Multiple R-squared: 0.04683, Adjusted R-squared: 0.04667
## F-statistic: 281.4 on 1 and 5728 DF, p-value: < 2.2e-16
Now income is clearly statistically significant. What age does this model predict for someone who reports $10,000 of income? What about for someone with $1,000,000 of income? What about for someone with $1 of income, or $0 of income? Do these values seem reasonable?
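One way to check (a sketch using the model just fit): pass those incomes to predict(), or plug them into the fitted equation 7.36 + 5.33 * log10(income) by hand. For $0 of income, log10(0) is -Inf, so this model can't give a sensible prediction at all.
# predicted ages for a few hypothetical reported incomes
predict(m, newdata = data.frame(income = c(10000, 1000000, 1)))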
Note that using the log of income is far from our only choice. We could get creative:
m <- lm(age~I(1/(income+10000)), data=train)
summary(m)
##
## Call:
## lm(formula = age ~ I(1/(income + 10000)), data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.072 -6.199 -2.554 4.041 40.982
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.838e+01 2.519e-01 152.38 <2e-16 ***
## I(1/(income + 10000)) -3.108e+05 1.189e+04 -26.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.348 on 5728 degrees of freedom
## (24243 observations deleted due to missingness)
## Multiple R-squared: 0.1066, Adjusted R-squared: 0.1064
## F-statistic: 683.5 on 1 and 5728 DF, p-value: < 2.2e-16
Which of these two models has the lower in-sample error?
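The residual standard errors reported above (9.656 versus 9.348) already suggest an answer, but we can also compute the in-sample RMSE directly. A sketch, refitting both models so that neither overwrites the other:
# refit both candidate models
m_log <- lm(age ~ log(income, base=10), data=train)
m_inv <- lm(age ~ I(1/(income + 10000)), data=train)
# in-sample RMSE, computed only over the rows where income (and hence a prediction) is available
sqrt(mean((predict(m_log, train) - train$age)^2, na.rm=TRUE))
sqrt(mean((predict(m_inv, train) - train$age)^2, na.rm=TRUE))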
One problem with either model is that most of its predictions will be NA, because most OkCupid users (roughly 48 thousand of the total 60 thousand) did not list their incomes. Here are the first 40 predictions:
predict(m, train)[1:40]
## 1 2 3 4 5 6 7 8
## NA NA NA NA NA NA NA 33.19900
## 9 10 11 12 13 14 15 16
## NA NA NA NA NA NA NA NA
## 17 18 19 20 21 22 23 24
## NA NA NA NA NA NA NA NA
## 25 26 27 28 29 30 31 32
## 38.07191 NA NA NA NA NA NA NA
## 33 34 35 36 37 38 39 40
## NA 28.01834 NA NA 36.43692 NA NA NA
How do we work around this? One option is to add an indicator for whether the user listed an income at all, and to set the log-income term to zero whenever they didn't (the indicator then absorbs the difference between the two groups):
# 0/1 indicator for whether the user listed an income
train$listsincome[!is.na(train$income)] <- 1
train$listsincome[is.na(train$income)] <- 0
# log income where it was listed, 0 otherwise (the indicator term absorbs the difference)
train$logincome <- ifelse(train$listsincome, log(train$income, base=10), 0)
m <- lm(age~logincome+listsincome, data=train)
summary(m)
##
## Call:
## lm(formula = age ~ logincome + listsincome, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.343 -6.291 -2.291 4.049 77.709
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.29101 0.06073 531.75 <2e-16 ***
## logincome 5.33014 0.31112 17.13 <2e-16 ***
## listsincome -24.92898 1.48206 -16.82 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.455 on 29970 degrees of freedom
## Multiple R-squared: 0.009906, Adjusted R-squared: 0.00984
## F-statistic: 149.9 on 2 and 29970 DF, p-value: < 2.2e-16
hist(predict(m, train))
hist(train$age)
levels(train$orientation)
## [1] "bisexual" "gay" "straight"
m <- lm(age~logincome+listsincome+orientation, data=train)
summary(m)
##
## Call:
## lm(formula = age ~ logincome + listsincome + orientation, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.314 -6.416 -2.416 4.563 77.584
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.4707 0.2566 114.872 <2e-16 ***
## logincome 5.1727 0.3109 16.640 <2e-16 ***
## listsincome -24.1376 1.4808 -16.300 <2e-16 ***
## orientationgay 2.9571 0.3112 9.502 <2e-16 ***
## orientationstraight 2.9449 0.2612 11.274 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.434 on 29966 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.01412, Adjusted R-squared: 0.01399
## F-statistic: 107.3 on 4 and 29966 DF, p-value: < 2.2e-16
levels(profiles$sign)
## [1] "aquarius"
## [2] "aquarius and it matters a lot"
## [3] "aquarius and it's fun to think about"
## [4] "aquarius but it doesn't matter"
## [5] "aries"
## [6] "aries and it matters a lot"
## [7] "aries and it's fun to think about"
## [8] "aries but it doesn't matter"
## [9] "cancer"
## [10] "cancer and it matters a lot"
## [11] "cancer and it's fun to think about"
## [12] "cancer but it doesn't matter"
## [13] "capricorn"
## [14] "capricorn and it matters a lot"
## [15] "capricorn and it's fun to think about"
## [16] "capricorn but it doesn't matter"
## [17] "gemini"
## [18] "gemini and it matters a lot"
## [19] "gemini and it's fun to think about"
## [20] "gemini but it doesn't matter"
## [21] "leo"
## [22] "leo and it matters a lot"
## [23] "leo and it's fun to think about"
## [24] "leo but it doesn't matter"
## [25] "libra"
## [26] "libra and it matters a lot"
## [27] "libra and it's fun to think about"
## [28] "libra but it doesn't matter"
## [29] "pisces"
## [30] "pisces and it matters a lot"
## [31] "pisces and it's fun to think about"
## [32] "pisces but it doesn't matter"
## [33] "sagittarius"
## [34] "sagittarius and it matters a lot"
## [35] "sagittarius and it's fun to think about"
## [36] "sagittarius but it doesn't matter"
## [37] "scorpio"
## [38] "scorpio and it matters a lot"
## [39] "scorpio and it's fun to think about"
## [40] "scorpio but it doesn't matter"
## [41] "taurus"
## [42] "taurus and it matters a lot"
## [43] "taurus and it's fun to think about"
## [44] "taurus but it doesn't matter"
## [45] "virgo"
## [46] "virgo and it matters a lot"
## [47] "virgo and it's fun to think about"
## [48] "virgo but it doesn't matter"
train$sign[1:20]
## [1] gemini pisces but it doesn't matter
## [3] aquarius virgo
## [5] gemini but it doesn't matter taurus
## [7] taurus taurus
## [9] pisces but it doesn't matter libra but it doesn't matter
## [11] libra sagittarius but it doesn't matter
## [13] scorpio and it matters a lot leo and it's fun to think about
## [15] <NA> gemini
## [17] libra sagittarius but it doesn't matter
## [19] cancer and it's fun to think about libra
## 48 Levels: aquarius ... virgo but it doesn't matter
grepl("fun",train$sign)[1:20]
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
# score attitude toward signs: +1 for "fun to think about", -1 for "doesn't matter", 0 otherwise
train$sign_fun <- 0
train[grepl("fun",train$sign),"sign_fun"] <- 1
train[grepl("doesn't matter",train$sign),"sign_fun"] <- -1
m <- lm(age~logincome+listsincome+orientation+sign_fun, data=train)
summary(m)
##
## Call:
## lm(formula = age ~ logincome + listsincome + orientation + sign_fun,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.617 -6.554 -2.409 4.319 77.591
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.4221 0.2568 114.571 < 2e-16 ***
## logincome 5.1985 0.3109 16.723 < 2e-16 ***
## listsincome -24.2551 1.4808 -16.380 < 2e-16 ***
## orientationgay 2.9606 0.3111 9.515 < 2e-16 ***
## orientationstraight 2.9872 0.2614 11.429 < 2e-16 ***
## sign_fun 0.2720 0.0706 3.853 0.000117 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.432 on 29965 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.01461, Adjusted R-squared: 0.01444
## F-statistic: 88.83 on 5 and 29965 DF, p-value: < 2.2e-16
A Little Cleanup?
# treat implausibly small heights (under 50 inches) as missing
train$height[train$height < 50] <- NA
# 0/1 indicators built from text matches in the job and diet fields
train[grepl("student",train$job),"isstudent"] <- 1
train[!grepl("student",train$job),"isstudent"] <- 0
train[grepl("strictly",train$diet),"diet_strict"] <- 1
train[!grepl("strictly",train$diet),"diet_strict"] <- 0
We may need to convert education to a numeric scale (roughly, years of schooling):
levels(train$education)
## [1] "college/university"
## [2] "dropped out of college/university"
## [3] "dropped out of high school"
## [4] "dropped out of law school"
## [5] "dropped out of masters program"
## [6] "dropped out of med school"
## [7] "dropped out of ph.d program"
## [8] "dropped out of space camp"
## [9] "dropped out of two-year college"
## [10] "graduated from college/university"
## [11] "graduated from high school"
## [12] "graduated from law school"
## [13] "graduated from masters program"
## [14] "graduated from med school"
## [15] "graduated from ph.d program"
## [16] "graduated from space camp"
## [17] "graduated from two-year college"
## [18] "high school"
## [19] "law school"
## [20] "masters program"
## [21] "med school"
## [22] "ph.d program"
## [23] "space camp"
## [24] "two-year college"
## [25] "working on college/university"
## [26] "working on high school"
## [27] "working on law school"
## [28] "working on masters program"
## [29] "working on med school"
## [30] "working on ph.d program"
## [31] "working on space camp"
## [32] "working on two-year college"
# rough years of schooling; rows with no listed education keep the default of 14
train$ed.level <- 14
train[grepl("high school",train$education),"ed.level"] <- 12
train[grepl("two-year",train$education),"ed.level"] <- 14
# note: later patterns overwrite earlier ones, so "college" also recodes the two-year levels to 16
train[grepl("college",train$education),"ed.level"] <- 16
train[grepl("masters",train$education),"ed.level"] <- 18
train[grepl("law school",train$education),"ed.level"] <- 19
train[grepl("med",train$education),"ed.level"] <- 20
train[grepl("ph.d",train$education),"ed.level"] <- 21
# subtract two years for anyone who dropped out
train[grepl("dropped",train$education),"ed.level"] <-
train[grepl("dropped",train$education),"ed.level"] -2
summary(train$ed.level)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.00 16.00 16.00 16.22 18.00 21.00
hist(train$ed.level)
You may find that you want to recode other variables in a similar way.
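Finally, remember that any variable you engineer on train has to be rebuilt the same way on test before you can predict the missing ages. Below is a minimal sketch of producing a submission using the last model fit above (the logincome + listsincome + orientation + sign_fun model); the output file name and the choice of columns are placeholders of mine, not a required format, and any predictions that come out NA would still need to be filled in (for instance with the mean age from train).
# rebuild the engineered variables on the test set, exactly as on train
test$listsincome <- ifelse(is.na(test$income), 0, 1)
test$logincome <- ifelse(test$listsincome, log(test$income, base=10), 0)
test$sign_fun <- 0
test[grepl("fun",test$sign),"sign_fun"] <- 1
test[grepl("doesn't matter",test$sign),"sign_fun"] <- -1
# predict ages and write them out (placeholder file name and columns)
test$predicted_age <- predict(m, test)
write.csv(test[, c("ID","predicted_age")], "age_predictions.csv", row.names=FALSE)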