Lab 7: The Curse of Beauty

Here we use a data set of student evaluations which includes a ranking of how good-looking the professor is.

load(url("http://s3.amazonaws.com/assets.datacamp.com/course/dasi/evals.RData"))
head(evals)
##   score         rank    ethnicity gender language age cls_perc_eval
## 1   4.7 tenure track     minority female  english  36         55.81
## 2   4.1 tenure track     minority female  english  36         68.80
## 3   3.9 tenure track     minority female  english  36         60.80
## 4   4.8 tenure track     minority female  english  36         62.60
## 5   4.6      tenured not minority   male  english  59         85.00
## 6   4.3      tenured not minority   male  english  59         87.50
##   cls_did_eval cls_students cls_level cls_profs  cls_credits bty_f1lower
## 1           24           43     upper    single multi credit           5
## 2           86          125     upper    single multi credit           5
## 3           76          125     upper    single multi credit           5
## 4           77          123     upper    single multi credit           5
## 5           17           20     upper  multiple multi credit           4
## 6           35           40     upper  multiple multi credit           4
##   bty_f1upper bty_f2upper bty_m1lower bty_m1upper bty_m2upper bty_avg
## 1           7           6           2           4           6       5
## 2           7           6           2           4           6       5
## 3           7           6           2           4           6       5
## 4           7           6           2           4           6       5
## 5           4           2           2           3           3       3
## 6           4           2           2           3           3       3
##   pic_outfit pic_color
## 1 not formal     color
## 2 not formal     color
## 3 not formal     color
## 4 not formal     color
## 5 not formal     color
## 6 not formal     color
names(evals)
##  [1] "score"         "rank"          "ethnicity"     "gender"       
##  [5] "language"      "age"           "cls_perc_eval" "cls_did_eval" 
##  [9] "cls_students"  "cls_level"     "cls_profs"     "cls_credits"  
## [13] "bty_f1lower"   "bty_f1upper"   "bty_f2upper"   "bty_m1lower"  
## [17] "bty_m1upper"   "bty_m2upper"   "bty_avg"       "pic_outfit"   
## [21] "pic_color"

Question 3
This question requires summary statistics on the score variable

summary(evals$score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.30    3.80    4.30    4.17    4.60    5.00
hist(evals$score)

plot of chunk unnamed-chunk-2

Visualizing Relationships

Now we use some of R's data visualization techniques.

# Create a scatterplot for 'age' vs 'bty_avg':
plot(evals$age, evals$bty_avg)

plot of chunk unnamed-chunk-3

# Create a boxplot for 'age' and 'gender':
boxplot(evals$age ~ evals$gender)

plot of chunk unnamed-chunk-3

# Create a mosaic plot for 'rank' and 'gender':
mosaicplot(evals$rank ~ evals$gender)

plot of chunk unnamed-chunk-3

Simple Linear Regression

# Create a scatterplot for 'score' and 'bty_avg'
plot(evals$score ~ evals$bty_avg)

plot of chunk unnamed-chunk-4

Note the overplotting.

The Jitter Function

plot(evals$score ~ jitter(evals$bty_avg))

plot of chunk unnamed-chunk-5

More than natural variation?

plot(evals$score ~ jitter(evals$bty_avg))

plot of chunk unnamed-chunk-6


# Construct the linear model:
m_bty <- lm(evals$score ~ evals$bty_avg)

# Plot your linear model on your plot:
plot(m_bty)

plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6

abline(m_bty)

plot of chunk unnamed-chunk-6

This actually gives you a lot of diagnostic plots as well.

Question 4

summary(m_bty)
## 
## Call:
## lm(formula = evals$score ~ evals$bty_avg)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.925 -0.369  0.142  0.398  0.931 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.8803     0.0761   50.96  < 2e-16 ***
## evals$bty_avg   0.0666     0.0163    4.09  5.1e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.535 on 461 degrees of freedom
## Multiple R-squared:  0.035,  Adjusted R-squared:  0.0329 
## F-statistic: 16.7 on 1 and 461 DF,  p-value: 5.08e-05

Just because something is statistically significant does not mean it is of any practical significance.

Question 5

The plots show us that the residuals are skewed to the right but are nearly normal. With the large sample size this violation of the assumptions of regression may not be critical.

Multiple Linear Regression

plot(evals$bty_f1lower, evals$bty_avg)

plot of chunk unnamed-chunk-8

cor(evals$bty_f1lower, evals$bty_avg)
## [1] 0.8439

The relationship between all beauty variables

The relationship is actually quite strong. We can actually take a look at the relationships between all beauty variables (columns 13 through 19) by passing those columns of the data set to the plot() command simultaneously.

plot(evals[, 13:19])

plot of chunk unnamed-chunk-9

Given how similar the variables are, that is, their high degree of correlation, it should be safe to use bty_avg as a proxy for all of them.

Taking into account gender

Now we add a control variable. We ask is the effect different for men and women?

m_bty_gen = lm(score ~ bty_avg + gender, data = evals)
summary(m_bty_gen)
## 
## Call:
## lm(formula = score ~ bty_avg + gender, data = evals)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.831 -0.362  0.105  0.421  0.931 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.7473     0.0847   44.27  < 2e-16 ***
## bty_avg       0.0742     0.0163    4.56  6.5e-06 ***
## gendermale    0.1724     0.0502    3.43  0.00065 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.529 on 460 degrees of freedom
## Multiple R-squared:  0.0591, Adjusted R-squared:  0.055 
## F-statistic: 14.5 on 2 and 460 DF,  p-value: 8.18e-07

Gender Male

We use their custom function to plot the regression with a dummy variable.

multiLines(m_bty_gen)

plot of chunk unnamed-chunk-11

Cool! I would like to look at the code.

Switching Rank and Gender

m_bty_rank = lm(score ~ bty_avg + rank, data = evals)
summary(m_bty_rank)
## 
## Call:
## lm(formula = score ~ bty_avg + rank, data = evals)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.871 -0.364  0.149  0.410  0.953 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        3.9815     0.0908   43.86  < 2e-16 ***
## bty_avg            0.0678     0.0165    4.10  4.9e-05 ***
## ranktenure track  -0.1607     0.0740   -2.17    0.030 *  
## ranktenured       -0.1262     0.0627   -2.01    0.045 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.533 on 459 degrees of freedom
## Multiple R-squared:  0.0465, Adjusted R-squared:  0.0403 
## F-statistic: 7.46 on 3 and 459 DF,  p-value: 6.88e-05

Search for the best model

m_full = lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + 
    cls_students + cls_level + cls_profs + cls_credits + bty_avg, data = evals)
summary(m_full)
## 
## Call:
## lm(formula = score ~ rank + ethnicity + gender + language + age + 
##     cls_perc_eval + cls_students + cls_level + cls_profs + cls_credits + 
##     bty_avg, data = evals)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8448 -0.3137  0.0856  0.3573  1.1010 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            3.530504   0.240820   14.66  < 2e-16 ***
## ranktenure track      -0.107012   0.082025   -1.30  0.19269    
## ranktenured           -0.045037   0.065218   -0.69  0.49020    
## ethnicitynot minority  0.186965   0.077533    2.41  0.01629 *  
## gendermale             0.178617   0.051535    3.47  0.00058 ***
## languagenon-english   -0.126825   0.108036   -1.17  0.24105    
## age                   -0.006650   0.003083   -2.16  0.03154 *  
## cls_perc_eval          0.005700   0.001551    3.67  0.00027 ***
## cls_students           0.000445   0.000358    1.24  0.21460    
## cls_levelupper         0.018710   0.055583    0.34  0.73656    
## cls_profssingle       -0.008575   0.051353   -0.17  0.86746    
## cls_creditsone credit  0.508743   0.117013    4.35  1.7e-05 ***
## bty_avg                0.061265   0.016675    3.67  0.00027 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.504 on 450 degrees of freedom
## Multiple R-squared:  0.164,  Adjusted R-squared:  0.141 
## F-statistic: 7.33 on 12 and 450 DF,  p-value: 2.41e-12

Eliminating variables from the model: p-value selection

m_new = lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + 
    cls_students + cls_level + cls_credits + bty_avg, data = evals)
summary(m_new)
## 
## Call:
## lm(formula = score ~ rank + ethnicity + gender + language + age + 
##     cls_perc_eval + cls_students + cls_level + cls_credits + 
##     bty_avg, data = evals)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8505 -0.3139  0.0805  0.3596  1.1036 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            3.528630   0.240299   14.68  < 2e-16 ***
## ranktenure track      -0.107364   0.081910   -1.31  0.19061    
## ranktenured           -0.045374   0.065117   -0.70  0.48628    
## ethnicitynot minority  0.189372   0.076099    2.49  0.01319 *  
## gendermale             0.178027   0.051358    3.47  0.00058 ***
## languagenon-english   -0.126574   0.107909   -1.17  0.24143    
## age                   -0.006662   0.003079   -2.16  0.03101 *  
## cls_perc_eval          0.005679   0.001545    3.68  0.00027 ***
## cls_students           0.000449   0.000357    1.26  0.20932    
## cls_levelupper         0.018374   0.055487    0.33  0.74069    
## cls_creditsone credit  0.510916   0.116161    4.40  1.4e-05 ***
## bty_avg                0.061150   0.016643    3.67  0.00027 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.503 on 451 degrees of freedom
## Multiple R-squared:  0.163,  Adjusted R-squared:  0.143 
## F-statistic: 8.01 on 11 and 451 DF,  p-value: 8.3e-13

Eliminate all one by one

Eliminate each variable from the model and see which version yeilds the highest adjusted R-Squared.

m_full = lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + 
    cls_students + cls_level + cls_profs + cls_credits + bty_avg, data = evals)
summary(m_full)$adj.r.squared
## [1] 0.1412

## remove rank
m_nRank = lm(score ~ ethnicity + gender + language + age + cls_perc_eval + cls_students + 
    cls_level + cls_profs + cls_credits + bty_avg, data = evals)
summary(m_nRank)$adj.r.squared
## [1] 0.1418

m_eth = lm(score ~ rank + gender + language + age + cls_perc_eval + cls_students + 
    cls_level + cls_profs + cls_credits + bty_avg, data = evals)
summary(m_eth)$adj.r.squared
## [1] 0.132

m_gender = lm(score ~ rank + ethnicity + language + age + cls_perc_eval + cls_students + 
    cls_level + cls_profs + cls_credits + bty_avg, data = evals)
summary(m_gender)$adj.r.squared
## [1] 0.1202

m_language = lm(score ~ rank + ethnicity + gender + age + cls_perc_eval + cls_students + 
    cls_level + cls_profs + cls_credits + bty_avg, data = evals)
summary(m_language)$adj.r.squared
## [1] 0.1405


m_age = lm(score ~ rank + ethnicity + gender + language + cls_perc_eval + cls_students + 
    cls_level + cls_profs + cls_credits + bty_avg, data = evals)
summary(m_age)$adj.r.squared
## [1] 0.1343


m_cls_perc_eval = lm(score ~ rank + ethnicity + gender + language + age + cls_students + 
    cls_level + cls_profs + cls_credits + bty_avg, data = evals)
summary(m_cls_perc_eval)$adj.r.squared
## [1] 0.1174


m_cls_students = lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + 
    cls_level + cls_profs + cls_credits + bty_avg, data = evals)
summary(m_cls_students)$adj.r.squared
## [1] 0.1402


m_cls_level = lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + 
    cls_students + cls_profs + cls_credits + bty_avg, data = evals)
summary(m_cls_level)$adj.r.squared
## [1] 0.1429


m_cls_profs = lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + 
    cls_students + cls_level + cls_credits + bty_avg, data = evals)
summary(m_cls_profs)$adj.r.squared
## [1] 0.1431


m_cls_credits = lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + 
    cls_students + cls_level + cls_profs + bty_avg, data = evals)
summary(m_cls_credits)$adj.r.squared
## [1] 0.1071


m_bty_avg = lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval + 
    cls_students + cls_level + cls_profs + cls_credits, data = evals)
summary(m_bty_avg)$adj.r.squared
## [1] 0.1174

We find that the removal of the variable cls_profs (whether the professor in the class is single) increases the adjusted R-squared the most.