611 Highlights in R

611 Syllabus in R

Here’s the 611 schedule of topics from this year, and some code highlights from each week.

# a toy data set for us to work with
n <- 100
gender <- as.factor(c(rep("M", n/2), rep("F", n/2)))
IQ <- rnorm(n, mean=100, sd=15)
degree <- rep(c("HS", "BA", "MS", "PhD"), n/4)
height <- as.numeric(gender)-2 + rnorm(n, mean=5.5, sd=1)
RT1 <- rchisq(n, 4)
RT2 <- rchisq(n, 4)
DV <- 50 - 5*(as.numeric(gender)-1) + .1*IQ + rnorm(n)

df <- data.frame(awesome=DV, gender=gender, IQ=IQ, degree=degree, height=height, RT1=RT1, RT2=RT2)

Exploratory Data Analysis

head(df)

##    awesome gender        IQ degree   height       RT1      RT2
## 1 55.28120      M 112.14288     HS 4.646628 0.3633789 2.824327
## 2 54.09872      M  91.23829     BA 5.560339 2.2456835 1.692923
## 3 53.04562      M  83.92578     MS 3.446986 8.0926142 4.441702
## 4 55.60539      M  87.97319    PhD 4.846104 9.3437717 3.974989
## 5 57.23617      M 111.96855     HS 4.773782 3.6712809 2.625829
## 6 55.18866      M  92.76796     BA 6.822313 5.4973758 3.136531

View(df)

summary(df)

##     awesome      gender       IQ         degree       height     
##  Min.   :50.24   F:50   Min.   : 63.51   BA :25   Min.   :2.241  
##  1st Qu.:54.94   M:50   1st Qu.: 88.89   HS :25   1st Qu.:4.367  
##  Median :57.61          Median :100.17   MS :25   Median :4.910  
##  Mean   :57.35          Mean   : 98.94   PhD:25   Mean   :4.970  
##  3rd Qu.:59.78          3rd Qu.:109.40            3rd Qu.:5.738  
##  Max.   :62.47          Max.   :133.81            Max.   :7.160  
##       RT1               RT2         
##  Min.   : 0.1865   Min.   : 0.2513  
##  1st Qu.: 2.1216   1st Qu.: 1.8807  
##  Median : 3.8957   Median : 3.3573  
##  Mean   : 3.9809   Mean   : 3.6674  
##  3rd Qu.: 5.5078   3rd Qu.: 4.9681  
##  Max.   :12.1995   Max.   :13.7104

library(psych)
describe(df)

##         vars   n  mean    sd median trimmed   mad   min    max range  skew
## awesome    1 100 57.35  2.96  57.61   57.44  3.72 50.24  62.47 12.23 -0.23
## gender*    2 100   NaN    NA     NA     NaN    NA   Inf   -Inf  -Inf    NA
## IQ         3 100 98.94 14.73 100.17   99.16 15.35 63.51 133.81 70.30 -0.15
## degree*    4 100   NaN    NA     NA     NaN    NA   Inf   -Inf  -Inf    NA
## height     5 100  4.97  1.10   4.91    4.99  1.08  2.24   7.16  4.92 -0.15
## RT1        6 100  3.98  2.51   3.90    3.77  2.50  0.19  12.20 12.01  0.75
## RT2        7 100  3.67  2.33   3.36    3.43  2.31  0.25  13.71 13.46  1.16
##         kurtosis   se
## awesome    -0.94 0.30
## gender*       NA   NA
## IQ         -0.62 1.47
## degree*       NA   NA
## height     -0.48 0.11
## RT1         0.25 0.25
## RT2         2.12 0.23

describe(df[,c("IQ","awesome")])

##         vars   n  mean    sd median trimmed   mad   min    max range  skew
## IQ         1 100 98.94 14.73 100.17   99.16 15.35 63.51 133.81 70.30 -0.15
## awesome    2 100 57.35  2.96  57.61   57.44  3.72 50.24  62.47 12.23 -0.23
##         kurtosis   se
## IQ         -0.62 1.47
## awesome    -0.94 0.30

plot(df$awesome ~ df$IQ)

plot(df$awesome ~ df$gender)

plot(df$awesome ~ df$degree)

plot(df$height ~ df$gender)

hist(df$IQ)

hist(df$RT1)

cor(df$awesome, df$IQ)

## [1] 0.4685434

cor(df[,c(1,3,5:7)])

##             awesome          IQ      height         RT1         RT2
## awesome  1.00000000  0.46854341 -0.33868198 -0.10441679  0.01706919
## IQ       0.46854341  1.00000000 -0.02077314  0.04033320 -0.03292867
## height  -0.33868198 -0.02077314  1.00000000  0.06604399 -0.08854055
## RT1     -0.10441679  0.04033320  0.06604399  1.00000000  0.17752155
## RT2      0.01706919 -0.03292867 -0.08854055  0.17752155  1.00000000

var(df$awesome, df$IQ)

## [1] 20.45331

table(df[,c(2,4)])

##       degree
## gender BA HS MS PhD
##      F 12 12 13  13
##      M 13 13 12  12

library(ggplot2)

library(GGally)
ggpairs(df) # the big guns

Distributions

qqnorm(df$IQ)
qqline(df$IQ, col="red")

qqnorm(df$RT1)
qqline(df$RT1, col="red")

df$RT1_sqrt <- sqrt(df$RT1)
qqnorm(df$RT1_sqrt)
qqline(df$RT1_sqrt, col="red")

Hypothesis Testing & Estimation: T-tests

# one-sample
t.test(IQ, mu=100, data=df, conf.level=.95)

## 
##  One Sample t-test
## 
## data:  IQ
## t = -0.7218, df = 99, p-value = 0.4721
## alternative hypothesis: true mean is not equal to 100
## 95 percent confidence interval:
##   96.0150 101.8591
## sample estimates:
## mean of x 
##  98.93704

# IST
t.test(IQ ~ gender, var.equal=TRUE, data=df)

## 
##  Two Sample t-test
## 
## data:  IQ by gender
## t = -0.2773, df = 98, p-value = 0.7821
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.692785  5.051723
## sample estimates:
## mean in group F mean in group M 
##        98.52677        99.34731

t.test(x=df[gender=="F",3], y=df[gender=="M",3], var.equal=TRUE)

## 
##  Two Sample t-test
## 
## data:  df[gender == "F", 3] and df[gender == "M", 3]
## t = -0.2773, df = 98, p-value = 0.7821
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.692785  5.051723
## sample estimates:
## mean of x mean of y 
##  98.52677  99.34731

# DST
t.test(x=RT1, y=RT2, paired=TRUE, data=df)

## 
##  Paired t-test
## 
## data:  RT1 and RT2
## t = 1.0094, df = 99, p-value = 0.3153
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.3028698  0.9300579
## sample estimates:
## mean of the differences 
##               0.3135941

One-Way ANOVAs

results <- aov(awesome ~ degree, data=df)
summary(aov(awesome ~ degree, data=df))

##             Df Sum Sq Mean Sq F value Pr(>F)
## degree       3   12.1   4.042   0.452  0.716
## Residuals   96  857.8   8.935

summary(results)

##             Df Sum Sq Mean Sq F value Pr(>F)
## degree       3   12.1   4.042   0.452  0.716
## Residuals   96  857.8   8.935

Contrasts

Contrasts are a mess in R, especially non-orthogonal contrasts, but you can do them. Some resources:
http://www.ats.ucla.edu/stat/r/library/contrast_coding.htm http://sites.stat.psu.edu/~jls/stat512/lectures/ContrastsInR.pdf http://www.uvm.edu/~dhowell/StatPages/More_Stuff/R/ContrastsInR.html

Two-Way ANOVAs

aov(awesome ~ degree*gender, data=df)

## Call:
##    aov(formula = awesome ~ degree * gender, data = df)
## 
## Terms:
##                   degree   gender degree:gender Residuals
## Sum of Squares   12.1269 582.6513       19.7314  255.3908
## Deg. of Freedom        3        1             3        92
## 
## Residual standard error: 1.666129
## Estimated effects may be unbalanced

summary(aov(awesome ~ degree*gender, data=df))

##               Df Sum Sq Mean Sq F value Pr(>F)    
## degree         3   12.1     4.0   1.456 0.2317    
## gender         1  582.7   582.7 209.890 <2e-16 ***
## degree:gender  3   19.7     6.6   2.369 0.0757 .  
## Residuals     92  255.4     2.8                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# regression coefficients output
summary(lm(awesome ~ degree*gender, data=df))

## 
## Call:
## lm(formula = awesome ~ degree * gender, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0557 -0.8270 -0.0566  1.0326  4.0704 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       59.15745    0.48097 122.996  < 2e-16 ***
## degreeHS          -0.02985    0.68019  -0.044   0.9651    
## degreeMS           1.30585    0.66699   1.958   0.0533 .  
## degreePhD          1.09156    0.66699   1.637   0.1051    
## genderM           -3.85966    0.66699  -5.787 9.83e-08 ***
## degreeHS:genderM  -0.25042    0.94326  -0.265   0.7912    
## degreeMS:genderM  -1.45544    0.94326  -1.543   0.1263    
## degreePhD:genderM -2.18151    0.94326  -2.313   0.0230 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.666 on 92 degrees of freedom
## Multiple R-squared:  0.7064, Adjusted R-squared:  0.6841 
## F-statistic: 31.62 on 7 and 92 DF,  p-value: < 2.2e-16

Correlation and Regression

cor(df$awesome, df$IQ)

## [1] 0.4685434

summary(lm(awesome ~ IQ, data=df))

## 
## Call:
## lm(formula = awesome ~ IQ, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.567 -2.563  0.171  2.545  4.067 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 48.01950    1.79662   26.73  < 2e-16 ***
## IQ           0.09431    0.01796    5.25 8.82e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.632 on 98 degrees of freedom
## Multiple R-squared:  0.2195, Adjusted R-squared:  0.2116 
## F-statistic: 27.57 on 1 and 98 DF,  p-value: 8.822e-07

Analysis of Categorical Data

table(df[,c("gender","degree")])

##       degree
## gender BA HS MS PhD
##      F 12 12 13  13
##      M 13 13 12  12

# goodness of fit
chisq.test(table(df$degree)) # are degrees equally distributed?

## 
##  Chi-squared test for given probabilities
## 
## data:  table(df$degree)
## X-squared = 0, df = 3, p-value = 1

chisq.test(table(df$degree), p=c(.4, .3, .2, .1)) # is it 40% BS, 30% HS, 20% MS, and 10% PhD?

## 
##  Chi-squared test for given probabilities
## 
## data:  table(df$degree)
## X-squared = 30.2083, df = 3, p-value = 1.248e-06

# test of independence 
chisq.test(table(df[,c(2,4)]))

## 
##  Pearson's Chi-squared test
## 
## data:  table(df[, c(2, 4)])
## X-squared = 0.16, df = 3, p-value = 0.9838

# Get measures of association
library(vcd)
assocstats(table(df[,c(2,4)]))

##                      X^2 df P(> X^2)
## Likelihood Ratio 0.16004  3  0.98377
## Pearson          0.16000  3  0.98377
## 
## Phi-Coefficient   : 0.04 
## Contingency Coeff.: 0.04 
## Cramer's V        : 0.04

611 Highlights in R

Rose Maier

January 4, 2015

611 Syllabus in R

Exploratory Data Analysis

Distributions

Hypothesis Testing & Estimation: T-tests

One-Way ANOVAs

Contrasts

Two-Way ANOVAs

Correlation and Regression

Analysis of Categorical Data