Data input example

For the first example, we read data from a remote file server for several hundred subjects on 13 personality scales (5 from the Eysenck Personality Inventory (EPI), 5 from a Big Five Inventory (BFI), the Beck Depression Inventory, and two anxiety scales). The file is structured normally, i.e., rows represent different subjects, columns represent different variables, and the first row gives the variable names. (To read from a local file, we simply change the value of datafilename.)

#specify the name and address of the remote file
datafilename <- "http://personality-project.org/r/datasets/maps.mixx.epi.bfi.data"
#datafilename <- "Desktop/epi.big5.txt"  #read from local directory  or
# datafilename <- file.choose()     # use the OS to find the file 
#in all cases
person.data  <- read.table(datafilename,header=TRUE)  #read the data file

#Alternatively, to read in a comma delimited file:
#person.data  <- read.table(datafilename,header=TRUE,sep=",")  
names(person.data)  #list the names of the variables
##  [1] "epiE"     "epiS"     "epiImp"   "epilie"   "epiNeur"  "bfagree" 
##  [7] "bfcon"    "bfext"    "bfneur"   "bfopen"   "bdi"      "traitanx"
## [13] "stateanx"

The data are now in the data frame “person.data”. Data frames allow columns that are either numeric or alphanumeric. To address a particular row (e.g., subject = 5) and column (e.g., variable = 8), you simply enter the required indices:

person.data[5,8]         #the 5th subject, 8th variable  or
## [1] 103
person.data[5,"bfext"]   #5th subject,  "Big Five Inventory - Extraversion" variable
## [1] 103
 #or  
person.data[5,]         #To list all the data for a particular subject (e.g, the 5th subject) 
##   epiE epiS epiImp epilie epiNeur bfagree bfcon bfext bfneur bfopen bdi
## 5   14    6      5      3       2     115   102   103     86    118   8
##   traitanx stateanx
## 5       39       67
person.data[5:10,]      #list cases 5 - 10
##    epiE epiS epiImp epilie epiNeur bfagree bfcon bfext bfneur bfopen bdi
## 5    14    6      5      3       2     115   102   103     86    118   8
## 6     6    4      2      5      15     110   113    61     54    149   5
## 7    15    9      4      3      12     109    58    99     55    110   7
## 8    18    9      7      2      10      92    57    94     72    114   0
## 9    15   11      3      3       1     127   108   108     35     86   0
## 10    8    5      2      2      10      74   100    61     87     89   7
##    traitanx stateanx
## 5        39       67
## 6        51       38
## 7        40       32
## 8        32       41
## 9        22       26
## 10       35       31
person.data[5:10,"bfext"]  #list  just the extraversion scores for subjects 5-10
## [1] 103  61  99  94 108  61
person.data[5:10,4:8]      #list the 4th through 8th variables for subjects 5 - 10.
##    epilie epiNeur bfagree bfcon bfext
## 5       3       2     115   102   103
## 6       5      15     110   113    61
## 7       3      12     109    58    99
## 8       2      10      92    57    94
## 9       3       1     127   108   108
## 10      2      10      74   100    61

In order to select a particular subset of the data, use the subset function. The next example uses subset to display the cases where the lie scale score was high (epilie > 6).

subset(person.data,epilie>6)    #print the data meeting the logical condition epilie>6
##     epiE epiS epiImp epilie epiNeur bfagree bfcon bfext bfneur bfopen bdi
## 16    11    5      6      7      13     126    78   112     83    132   4
## 212    6    4      1      7       4     147   119   102     81    142   2
##     traitanx stateanx
## 16        45       28
## 212       26       21

One can also display only particular columns for the cases meeting particular criteria, or extract selected variables from the data frame by name (a sketch of the latter follows).

subset(person.data[,2:5],epilie>5 & epiNeur<3) #notice that we are taking the logical 'and'  
##     epiS epiImp epilie epiNeur
## 12    12      3      6       1
## 118    8      2      6       2
subset(person.data[,3:7], epilie>6 | epiNeur ==2 ) #do a logical 'or' of the two conditions
##     epiImp epilie epiNeur bfagree bfcon
## 5        5      3       2     115   102
## 13       4      5       2     133   114
## 16       6      7      13     126    78
## 27       6      3       2     137   131
## 43       5      5       2     135   138
## 68       7      3       2     135   114
## 118      2      6       2     136    97
## 125      7      4       2     131    89
## 139      4      2       2     112   115
## 151      4      5       2     100   105
## 212      1      7       4     147   119
## 218      3      3       2      99   137
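To extract just a few named variables for those same cases, here is a minimal sketch using the select argument of subset (the object name and the particular variables chosen are only illustrative):

lie.data <- subset(person.data, epilie>6, select=c(epiS,epiNeur,bdi))
    #keep only three named variables for the high lie scale cases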

Correlations

library(psych)
datafilename <- "http://personality-project.org/R/datasets/maps.mixx.epi.bfi.data" 
person.data <-  read.table(datafilename,header=TRUE)  #read the data file
names(person.data)  #list the names of the variables
##  [1] "epiE"     "epiS"     "epiImp"   "epilie"   "epiNeur"  "bfagree" 
##  [7] "bfcon"    "bfext"    "bfneur"   "bfopen"   "bdi"      "traitanx"
## [13] "stateanx"
attach(person.data)      #make the separate variables available -- always detach when finished. 
             #The with construct is better.

epi <-  cbind(epiE,epiS,epiImp,epilie,epiNeur)       #form a new variable "epi" 
epi.df <- data.frame(epi)      #actually, more useful to treat this variable as a data frame 

bfi.df <- data.frame(cbind(bfext,bfneur,bfagree,bfcon,bfopen))    
    #create bfi as a data frame as well
    
detach(person.data)   #  very important to detach after an attach
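#the with() construct mentioned above avoids attach/detach entirely; an equivalent
#(illustrative) way to build the same two data frames would be
#epi.df <- with(person.data, data.frame(epiE, epiS, epiImp, epilie, epiNeur))
#bfi.df <- with(person.data, data.frame(bfext, bfneur, bfagree, bfcon, bfopen))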
describe(epi.df)  
##         vars   n  mean   sd median trimmed  mad min max range  skew
## epiE       1 231 13.33 4.14     14   13.49 4.45   1  22    21 -0.33
## epiS       2 231  7.58 2.69      8    7.77 2.97   0  13    13 -0.57
## epiImp     3 231  4.37 1.88      4    4.36 1.48   0   9     9  0.06
## epilie     4 231  2.38 1.50      2    2.27 1.48   0   7     7  0.66
## epiNeur    5 231 10.41 4.90     10   10.39 4.45   0  23    23  0.06
##         kurtosis   se
## epiE       -0.06 0.27
## epiS       -0.02 0.18
## epiImp     -0.62 0.12
## epilie      0.24 0.10
## epiNeur    -0.50 0.32
round(cor(epi.df),2)
##          epiE  epiS epiImp epilie epiNeur
## epiE     1.00  0.85   0.80  -0.22   -0.18
## epiS     0.85  1.00   0.43  -0.05   -0.22
## epiImp   0.80  0.43   1.00  -0.24   -0.07
## epilie  -0.22 -0.05  -0.24   1.00   -0.25
## epiNeur -0.18 -0.22  -0.07  -0.25    1.00
round(cor(epi.df,bfi.df),2)
##         bfext bfneur bfagree bfcon bfopen
## epiE     0.54  -0.09    0.18 -0.11   0.14
## epiS     0.58  -0.07    0.20  0.05   0.15
## epiImp   0.35  -0.09    0.08 -0.24   0.07
## epilie  -0.04  -0.22    0.17  0.23  -0.03
## epiNeur -0.17   0.63   -0.08 -0.13   0.09

Testing the significance of a set of correlations

The cor function does not report the probability of the correlation. The cor.test function in core R will test the significance of a single correlation. For those who are accustomed to testing many correlations at once, the corr.test function in the psych package will report the raw correlations, the pairwise number of observations, and the p-value of each correlation, either uncorrected or corrected for multiple tests using the Holm correction.
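For a single pair of variables, a minimal sketch using the core R function (epi.df is the data frame created in the correlation example above; output not shown):

cor.test(epi.df$epiE, epi.df$epiNeur)    #test the significance of a single correlation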

corr.test(sat.act)
## Call:corr.test(x = sat.act)
## Correlation matrix 
##           gender education   age   ACT  SATV  SATQ
## gender      1.00      0.09 -0.02 -0.04 -0.02 -0.17
## education   0.09      1.00  0.55  0.15  0.05  0.03
## age        -0.02      0.55  1.00  0.11 -0.04 -0.03
## ACT        -0.04      0.15  0.11  1.00  0.56  0.59
## SATV       -0.02      0.05 -0.04  0.56  1.00  0.64
## SATQ       -0.17      0.03 -0.03  0.59  0.64  1.00
## Sample Size 
##           gender education age ACT SATV SATQ
## gender       700       700 700 700  700  687
## education    700       700 700 700  700  687
## age          700       700 700 700  700  687
## ACT          700       700 700 700  700  687
## SATV         700       700 700 700  700  687
## SATQ         687       687 687 687  687  687
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##           gender education  age  ACT SATV SATQ
## gender      0.00      0.17 1.00 1.00    1    0
## education   0.02      0.00 0.00 0.00    1    1
## age         0.58      0.00 0.00 0.03    1    1
## ACT         0.33      0.00 0.00 0.00    0    0
## SATV        0.62      0.22 0.26 0.00    0    0
## SATQ        0.00      0.36 0.37 0.00    0    0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
corr.test(epi.df)
## Call:corr.test(x = epi.df)
## Correlation matrix 
##          epiE  epiS epiImp epilie epiNeur
## epiE     1.00  0.85   0.80  -0.22   -0.18
## epiS     0.85  1.00   0.43  -0.05   -0.22
## epiImp   0.80  0.43   1.00  -0.24   -0.07
## epilie  -0.22 -0.05  -0.24   1.00   -0.25
## epiNeur -0.18 -0.22  -0.07  -0.25    1.00
## Sample Size 
## [1] 231
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##         epiE epiS epiImp epilie epiNeur
## epiE    0.00 0.00   0.00   0.00    0.02
## epiS    0.00 0.00   0.00   0.53    0.00
## epiImp  0.00 0.00   0.00   0.00    0.53
## epilie  0.00 0.43   0.00   0.00    0.00
## epiNeur 0.01 0.00   0.26   0.00    0.00
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option

Inferential Statistics – Analysis of Variance

The following are examples of Analysis of Variance. A more complete listing and discussion of these examples, including the output, is on the R.anova page. This section just gives example instructions. See also the ANOVA section of Jonathan Baron’s page.

One Way ANOVA

#tell where the data come from
datafilename="http://personality-project.org/r/datasets/R.appendix1.data"
data.ex1=read.table(datafilename,header=T)   #read the data into a table

aov.ex1 = aov(Alertness~Dosage,data=data.ex1)  #do the analysis of variance
summary(aov.ex1)                                    #show the summary table
##             Df Sum Sq Mean Sq F value  Pr(>F)   
## Dosage       2  426.2  213.12   8.789 0.00298 **
## Residuals   15  363.8   24.25                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print(model.tables(aov.ex1,"means"),digits=3)
## Tables of means
## Grand mean
##          
## 27.66667 
## 
##  Dosage 
##        a    b    c
##     32.5 28.2 19.2
## rep  6.0  8.0  4.0
    #report the means and the number of subjects/cell
boxplot(Alertness~Dosage,data=data.ex1)

    #graphical summary appears in graphics window

Two Way (between subjects) Analysis of Variance (ANOVA)

The standard two-way ANOVA just adds another independent variable to the model. The example data set is stored at the personality-project.

datafilename="http://personality-project.org/R/datasets/R.appendix2.data"
data.ex2=read.table(datafilename,header=T)   #read the data into a table
data.ex2                                      #show the data
##    Observation Gender Dosage Alertness
## 1            1      m      a         8
## 2            2      m      a        12
## 3            3      m      a        13
## 4            4      m      a        12
## 5            5      m      b         6
## 6            6      m      b         7
## 7            7      m      b        23
## 8            8      m      b        14
## 9            9      f      a        15
## 10          10      f      a        12
## 11          11      f      a        22
## 12          12      f      a        14
## 13          13      f      b        15
## 14          14      f      b        12
## 15          15      f      b        18
## 16          16      f      b        22
aov.ex2 = aov(Alertness~Gender*Dosage,data=data.ex2)         #do the analysis of variance
summary(aov.ex2)                                    #show the summary table
##               Df Sum Sq Mean Sq F value Pr(>F)
## Gender         1  76.56   76.56   2.952  0.111
## Dosage         1   5.06    5.06   0.195  0.666
## Gender:Dosage  1   0.06    0.06   0.002  0.962
## Residuals     12 311.25   25.94
print(model.tables(aov.ex2,"means"),digits=3)      
## Tables of means
## Grand mean
##         
## 14.0625 
## 
##  Gender 
## Gender
##     f     m 
## 16.25 11.88 
## 
##  Dosage 
## Dosage
##     a     b 
## 13.50 14.62 
## 
##  Gender:Dosage 
##       Dosage
## Gender a     b    
##      f 15.75 16.75
##      m 11.25 12.50
    #report the means and the number of subjects/cell
boxplot(Alertness~Dosage*Gender,data=data.ex2) 

    #graphical summary of means of the 4 cells
attach(data.ex2)
interaction.plot(Dosage,Gender,Alertness)  #another way to graph the means 

detach(data.ex2)

One way repeated measures

Repeated measures are somewhat more complicated, because the within-subject error term must be specified explicitly (the Error(Subject/Valence) term in the aov call below). However, Jason French has prepared a very useful tutorial on using R for repeated measures.

datafilename="http://personality-project.org/r/datasets/R.appendix3.data"
data.ex3=read.table(datafilename,header=T)   #read the data into a table
data.ex3                                      #show the data
##    Observation Subject Valence Recall
## 1            1     Jim     Neg     32
## 2            2     Jim     Neu     15
## 3            3     Jim     Pos     45
## 4            4  Victor     Neg     30
## 5            5  Victor     Neu     13
## 6            6  Victor     Pos     40
## 7            7    Faye     Neg     26
## 8            8    Faye     Neu     12
## 9            9    Faye     Pos     42
## 10          10     Ron     Neg     22
## 11          11     Ron     Neu     10
## 12          12     Ron     Pos     38
## 13          13   Jason     Neg     29
## 14          14   Jason     Neu      8
## 15          15   Jason     Pos     35
aov.ex3 = aov(Recall~Valence+Error(Subject/Valence),data.ex3)
summary(aov.ex3)
## 
## Error: Subject
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  4  105.1   26.27               
## 
## Error: Subject:Valence
##           Df Sum Sq Mean Sq F value   Pr(>F)    
## Valence    2 2029.7  1014.9   189.1 1.84e-07 ***
## Residuals  8   42.9     5.4                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print(model.tables(aov.ex3,"means"),digits=3)       
## Tables of means
## Grand mean
##          
## 26.46667 
## 
##  Valence 
## Valence
##  Neg  Neu  Pos 
## 27.8 11.6 40.0
    #report the means and the number of subjects/cell
boxplot(Recall~Valence,data=data.ex3)          #graphical output

Two way repeated measures

This gets even more complicated: with two within-subject factors, the Error(Subject/(Task*Valence)) term specifies separate error strata for Task, Valence, and the Task by Valence interaction.

datafilename="http://personality-project.org/r/datasets/R.appendix4.data"
data.ex4=read.table(datafilename,header=T)   #read the data into a table
 data.ex4                                      #show the data
##    Observation Subject Task Valence Recall
## 1            1     Jim Free     Neg      8
## 2            2     Jim Free     Neu      9
## 3            3     Jim Free     Pos      5
## 4            4     Jim Cued     Neg      7
## 5            5     Jim Cued     Neu      9
## 6            6     Jim Cued     Pos     10
## 7            7  Victor Free     Neg     12
## 8            8  Victor Free     Neu     13
## 9            9  Victor Free     Pos     14
## 10          10  Victor Cued     Neg     16
## 11          11  Victor Cued     Neu     13
## 12          12  Victor Cued     Pos     14
## 13          13    Faye Free     Neg     13
## 14          14    Faye Free     Neu     13
## 15          15    Faye Free     Pos     12
## 16          16    Faye Cued     Neg     15
## 17          17    Faye Cued     Neu     16
## 18          18    Faye Cued     Pos     14
## 19          19     Ron Free     Neg     12
## 20          20     Ron Free     Neu     14
## 21          21     Ron Free     Pos     15
## 22          22     Ron Cued     Neg     17
## 23          23     Ron Cued     Neu     18
## 24          24     Ron Cued     Pos     20
## 25          25   Jason Free     Neg      6
## 26          26   Jason Free     Neu      7
## 27          27   Jason Free     Pos      9
## 28          28   Jason Cued     Neg      4
## 29          29   Jason Cued     Neu      9
## 30          30   Jason Cued     Pos     10
 aov.ex4=aov(Recall~(Task*Valence)+Error(Subject/(Task*Valence)),data.ex4 )

summary(aov.ex4)
## 
## Error: Subject
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  4  349.1   87.28               
## 
## Error: Subject:Task
##           Df Sum Sq Mean Sq F value Pr(>F)  
## Task       1  30.00  30.000   7.347 0.0535 .
## Residuals  4  16.33   4.083                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Error: Subject:Valence
##           Df Sum Sq Mean Sq F value Pr(>F)
## Valence    2   9.80   4.900   1.459  0.288
## Residuals  8  26.87   3.358               
## 
## Error: Subject:Task:Valence
##              Df Sum Sq Mean Sq F value Pr(>F)
## Task:Valence  2   1.40   0.700   0.291  0.755
## Residuals     8  19.27   2.408
print(model.tables(aov.ex4,"means"),digits=3)       
## Tables of means
## Grand mean
##      
## 11.8 
## 
##  Task 
## Task
## Cued Free 
## 12.8 10.8 
## 
##  Valence 
## Valence
##  Neg  Neu  Pos 
## 11.0 12.1 12.3 
## 
##  Task:Valence 
##       Valence
## Task   Neg  Neu  Pos 
##   Cued 11.8 13.0 13.6
##   Free 10.2 11.2 11.0
    #report the means and the number of subjects/cell
boxplot(Recall~Task*Valence,data=data.ex4) #graphical summary of means of the 6 cells

attach(data.ex4)
interaction.plot(Valence,Task,Recall)    #another way to graph the interaction

detach(data.ex4)

Four way ANOVA: two repeated measures and two between subjects variables

datafilename="http://personality-project.org/r/datasets/R.appendix5.data"
data.ex5=read.table(datafilename,header=T)   #read the data into a table
#data.ex5                                      #show the data
aov.ex5 = aov(Recall~(Task*Valence*Gender*Dosage) +
              Error(Subject/(Task*Valence)) + (Gender*Dosage), data.ex5)

summary(aov.ex5)             
## 
## Error: Subject
##               Df Sum Sq Mean Sq F value Pr(>F)  
## Gender         1  542.3   542.3   5.685 0.0345 *
## Dosage         2  694.9   347.5   3.643 0.0580 .
## Gender:Dosage  2   70.8    35.4   0.371 0.6976  
## Residuals     12 1144.6    95.4                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Error: Subject:Task
##                    Df Sum Sq Mean Sq F value   Pr(>F)    
## Task                1  96.33   96.33  39.862 3.87e-05 ***
## Task:Gender         1   1.33    1.33   0.552    0.472    
## Task:Dosage         2   8.17    4.08   1.690    0.226    
## Task:Gender:Dosage  2   3.17    1.58   0.655    0.537    
## Residuals          12  29.00    2.42                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Error: Subject:Valence
##                       Df Sum Sq Mean Sq F value Pr(>F)  
## Valence                2  14.69   7.343   2.998 0.0688 .
## Valence:Gender         2   3.91   1.954   0.798 0.4619  
## Valence:Dosage         4  20.26   5.065   2.068 0.1166  
## Valence:Gender:Dosage  4   1.04   0.259   0.106 0.9793  
## Residuals             24  58.78   2.449                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Error: Subject:Task:Valence
##                            Df Sum Sq Mean Sq F value Pr(>F)
## Task:Valence                2   5.39  2.6944   1.320  0.286
## Task:Valence:Gender         2   2.17  1.0833   0.531  0.595
## Task:Valence:Dosage         4   2.78  0.6944   0.340  0.848
## Task:Valence:Gender:Dosage  4   2.67  0.6667   0.327  0.857
## Residuals                  24  49.00  2.0417
print(model.tables(aov.ex5,"means"),digits=3)       
## Tables of means
## Grand mean
##          
## 15.62963 
## 
##  Task 
## Task
##     C     F 
## 16.57 14.69 
## 
##  Valence 
## Valence
##   Neg   Neu   Pos 
## 15.28 15.47 16.14 
## 
##  Gender 
## Gender
##     F     M 
## 17.87 13.39 
## 
##  Dosage 
## Dosage
##     A     B     C 
## 14.19 13.50 19.19 
## 
##  Task:Valence 
##     Valence
## Task Neg   Neu   Pos  
##    C 16.00 16.72 17.00
##    F 14.56 14.22 15.28
## 
##  Task:Gender 
##     Gender
## Task F     M    
##    C 18.93 14.22
##    F 16.81 12.56
## 
##  Valence:Gender 
##        Gender
## Valence F     M    
##     Neg 17.67 12.89
##     Neu 17.44 13.50
##     Pos 18.50 13.78
## 
##  Task:Dosage 
##     Dosage
## Task A     B     C    
##    C 14.94 14.83 19.94
##    F 13.44 12.17 18.44
## 
##  Valence:Dosage 
##        Dosage
## Valence A     B     C    
##     Neg 14.25 12.67 18.92
##     Neu 14.25 13.00 19.17
##     Pos 14.08 14.83 19.50
## 
##  Gender:Dosage 
##       Dosage
## Gender A     B     C    
##      F 16.56 16.67 20.39
##      M 11.83 10.33 18.00
## 
##  Task:Valence:Gender 
## , , Gender = F
## 
##     Valence
## Task Neg   Neu   Pos  
##    C 18.44 19.00 19.33
##    F 16.89 15.89 17.67
## 
## , , Gender = M
## 
##     Valence
## Task Neg   Neu   Pos  
##    C 13.56 14.44 14.67
##    F 12.22 12.56 12.89
## 
## 
##  Task:Valence:Dosage 
## , , Dosage = A
## 
##     Valence
## Task Neg   Neu   Pos  
##    C 15.00 15.00 14.83
##    F 13.50 13.50 13.33
## 
## , , Dosage = B
## 
##     Valence
## Task Neg   Neu   Pos  
##    C 13.67 14.83 16.00
##    F 11.67 11.17 13.67
## 
## , , Dosage = C
## 
##     Valence
## Task Neg   Neu   Pos  
##    C 19.33 20.33 20.17
##    F 18.50 18.00 18.83
## 
## 
##  Task:Gender:Dosage 
## , , Dosage = A
## 
##     Gender
## Task F     M    
##    C 17.22 12.67
##    F 15.89 11.00
## 
## , , Dosage = B
## 
##     Gender
## Task F     M    
##    C 18.33 11.33
##    F 15.00  9.33
## 
## , , Dosage = C
## 
##     Gender
## Task F     M    
##    C 21.22 18.67
##    F 19.56 17.33
## 
## 
##  Valence:Gender:Dosage 
## , , Dosage = A
## 
##        Gender
## Valence F     M    
##     Neg 16.67 11.83
##     Neu 16.33 12.17
##     Pos 16.67 11.50
## 
## , , Dosage = B
## 
##        Gender
## Valence F     M    
##     Neg 16.17  9.17
##     Neu 15.83 10.17
##     Pos 18.00 11.67
## 
## , , Dosage = C
## 
##        Gender
## Valence F     M    
##     Neg 20.17 17.67
##     Neu 20.17 18.17
##     Pos 20.83 18.17
## 
## 
##  Task:Valence:Gender:Dosage 
## , , Gender = F, Dosage = A
## 
##     Valence
## Task Neg   Neu   Pos  
##    C 17.33 17.33 17.00
##    F 16.00 15.33 16.33
## 
## , , Gender = M, Dosage = A
## 
##     Valence
## Task Neg   Neu   Pos  
##    C 12.67 12.67 12.67
##    F 11.00 11.67 10.33
## 
## , , Gender = F, Dosage = B
## 
##     Valence
## Task Neg   Neu   Pos  
##    C 17.33 18.00 19.67
##    F 15.00 13.67 16.33
## 
## , , Gender = M, Dosage = B
## 
##     Valence
## Task Neg   Neu   Pos  
##    C 10.00 11.67 12.33
##    F  8.33  8.67 11.00
## 
## , , Gender = F, Dosage = C
## 
##     Valence
## Task Neg   Neu   Pos  
##    C 20.67 21.67 21.33
##    F 19.67 18.67 20.33
## 
## , , Gender = M, Dosage = C
## 
##     Valence
## Task Neg   Neu   Pos  
##    C 18.00 19.00 19.00
##    F 17.33 17.33 17.33
    #report the means and the number of subjects/cell
boxplot(Recall~Task*Valence*Gender*Dosage,data=data.ex5) 

    #graphical summary of means of the 36 cells
boxplot(Recall~Task*Valence*Dosage,data=data.ex5)

    #graphical summary of means of  18 cells

Linear Regression

Many statistics used by psychologists and social scientists are special cases of the linear model (e.g., ANOVA is merely the linear model applied to categorical predictors). Generalizations of the linear model include an even wider range of statistical models.
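To see this concretely, here is a minimal sketch that refits the one way ANOVA example from above with lm (it assumes data.ex1 from that section is still in the workspace); anova() applied to the lm fit reproduces the aov summary table.

lm.ex1 <- lm(Alertness~Dosage, data=data.ex1)  #the same model as aov.ex1, fit as a regression
anova(lm.ex1)                                  #reproduces the F test from summary(aov.ex1)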

Consider the following models:

y ~ x or y ~ 1 + x are both examples of simple linear regression with an implicit or explicit intercept.
y ~ 0 + x, y ~ -1 + x, or y ~ x - 1 specify linear regression through the origin.
y ~ A, where A is a categorical factor, is a classic one way ANOVA model.
y ~ A + x is ANOVA with x as a covariate (analysis of covariance).
y ~ A*B or y ~ A + B + A:B is ANOVA with interaction terms.

These models can be fit with the linear model function (lm), and various summary statistics of the fit are then available. The data set is our familiar set of Eysenck Personality Inventory and Big Five Inventory scales, along with the Beck Depression Inventory and the state and trait anxiety scales. The first analysis simply regresses BDI on EPI Neuroticism; we then add in Trait Anxiety, and then the Neuroticism by Trait Anxiety interaction. Note that when we include an interaction term we need to zero center the predictors if we want to interpret the additive effects correctly; centering is done with the scale() function.

Graphical summaries of a regression show four diagnostic plots: the residuals as a function of the fitted values, a normal quantile-quantile plot of the residuals, the scale of the residuals as a function of the fitted values, and a plot of leverage to help identify influential cases. Models 5 and 6 predict bdi from the BFI scales alone and then from the BFI and EPI scales together, and model 7 (an example of overfitting) uses the epi and bfi scales as well as the interactions between the two sets of scales.

What follows are the commands for a number of demonstrations; samples of the commands and the output may be found in the regression page. Further examples there show how to find regressions for multiple dependent variables, or how to find the regression weights from the correlation matrix rather than from the raw data.
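As a preview of those last two ideas, here is a minimal sketch (not part of the demonstrations below; the object names are only illustrative, output is not shown, and it assumes the person.data data frame read in earlier is still in the workspace):

#several dependent variables at once: give lm a matrix on the left hand side
mlm.sketch <- lm(cbind(bdi,stateanx) ~ epiNeur + traitanx, data=person.data)
summary(mlm.sketch)         #one regression summary per dependent variable

#standardized regression weights computed directly from a correlation matrix
R <- cor(person.data[,c("epiE","epiS","epiImp","epilie","epiNeur","bdi")])
beta <- solve(R[1:5,1:5], R[1:5,"bdi"])    #solve Rxx %*% beta = rxy for beta
round(beta,2)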

datafilename="http://personality-project.org/r/datasets/maps.mixx.epi.bfi.data"  
    #where are the data
personality.data =read.table(datafilename,header=TRUE)  #read the data file
names(personality.data)     #what variables are in the data set?
##  [1] "epiE"     "epiS"     "epiImp"   "epilie"   "epiNeur"  "bfagree" 
##  [7] "bfcon"    "bfext"    "bfneur"   "bfopen"   "bdi"      "traitanx"
## [13] "stateanx"
attach(personality.data)    #make the variables easier to use
model1 = lm(bdi~epiNeur)  #simple regression of  beck depression on Neuroticism
summary(model1)   # basic statistical summary
## 
## Call:
## lm(formula = bdi ~ epiNeur)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.9548  -3.1577  -0.7707   2.0452  16.4092 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.32129    0.73070   -0.44    0.661    
## epiNeur      0.68200    0.06353   10.74   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.721 on 229 degrees of freedom
## Multiple R-squared:  0.3348, Adjusted R-squared:  0.3319 
## F-statistic: 115.3 on 1 and 229 DF,  p-value: < 2.2e-16
                  #pass parameters to the graphics device
op <- par(mfrow = c(2, 2), # 2 x 2 pictures on one plot
               pty = "s")       # square plotting region,
                                # independent of device size

    
plot(model1)     #diagnostic plots in the graphics window

model2=lm(bdi~epiNeur+traitanx)    #add in trait anxiety
summary(model2)    #basic output
## 
## Call:
## lm(formula = bdi ~ epiNeur + traitanx)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.0288  -2.6848  -0.5121   1.9823  13.3081 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.63779    1.24729  -6.124 3.97e-09 ***
## epiNeur      0.25506    0.08448   3.019  0.00282 ** 
## traitanx     0.30151    0.04347   6.935 4.13e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.299 on 228 degrees of freedom
## Multiple R-squared:  0.4507, Adjusted R-squared:  0.4459 
## F-statistic: 93.53 on 2 and 228 DF,  p-value: < 2.2e-16
plot(model2)

anova(model1,model2)             #compare the difference between the two models
## Analysis of Variance Table
## 
## Model 1: bdi ~ epiNeur
## Model 2: bdi ~ epiNeur + traitanx
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    229 5103.3                                  
## 2    228 4214.3  1    889.08 48.101 4.133e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model2.5=lm(bdi~epiNeur*traitanx) 
    #test for the interaction, note that the main effects are incorrect
summary(model2.5)               #because we need to 0 center the data
## 
## Call:
## lm(formula = bdi ~ epiNeur * traitanx)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.353  -2.625  -0.663   1.930  13.496 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      -4.520422   2.751136  -1.643   0.1017  
## epiNeur          -0.012794   0.227024  -0.056   0.9551  
## traitanx          0.210860   0.083505   2.525   0.0122 *
## epiNeur:traitanx  0.007290   0.005736   1.271   0.2051  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.293 on 227 degrees of freedom
## Multiple R-squared:  0.4546, Adjusted R-squared:  0.4474 
## F-statistic: 63.06 on 3 and 227 DF,  p-value: < 2.2e-16
anova(model2,model2.5)          #compare the two models 
## Analysis of Variance Table
## 
## Model 1: bdi ~ epiNeur + traitanx
## Model 2: bdi ~ epiNeur * traitanx
##   Res.Df    RSS Df Sum of Sq     F Pr(>F)
## 1    228 4214.3                          
## 2    227 4184.5  1    29.771 1.615 0.2051
                                #rescale the data to do the analysis  
cneur=scale(epiNeur,scale=F)    #0 center epiNeur
zneur=scale(epiNeur,scale=T)     #standardize epiNeur
ctrait = scale(traitanx,scale=F) #0 center traitAnx

model3=lm(bdi~cneur+ctrait+cneur*ctrait)
summary(model3)                    #explicitly list the additive and interactive terms
## 
## Call:
## lm(formula = bdi ~ cneur + ctrait + cneur * ctrait)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.353  -2.625  -0.663   1.930  13.496 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.532458   0.342787  19.057  < 2e-16 ***
## cneur        0.271581   0.085363   3.181  0.00167 ** 
## ctrait       0.286759   0.044940   6.381 9.78e-10 ***
## cneur:ctrait 0.007290   0.005736   1.271  0.20509    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.293 on 227 degrees of freedom
## Multiple R-squared:  0.4546, Adjusted R-squared:  0.4474 
## F-statistic: 63.06 on 3 and 227 DF,  p-value: < 2.2e-16
plot(model3)

model4=lm(bdi~cneur*ctrait)
summary(model4)                    #note how this is exactly the same as the previous model
## 
## Call:
## lm(formula = bdi ~ cneur * ctrait)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.353  -2.625  -0.663   1.930  13.496 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.532458   0.342787  19.057  < 2e-16 ***
## cneur        0.271581   0.085363   3.181  0.00167 ** 
## ctrait       0.286759   0.044940   6.381 9.78e-10 ***
## cneur:ctrait 0.007290   0.005736   1.271  0.20509    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.293 on 227 degrees of freedom
## Multiple R-squared:  0.4546, Adjusted R-squared:  0.4474 
## F-statistic: 63.06 on 3 and 227 DF,  p-value: < 2.2e-16
epi=cbind(epiS,epiImp,epilie,epiNeur) 
    #form a new variable "epi" without overall extraversion
epi=as.matrix(epi)      #actually, more useful to treat this variable as a matrix

bfi=as.matrix(cbind(bfext,bfneur,bfagree,bfcon,bfopen))    #create bfi as a matrix as well
epi=scale(epi,scale=T)   #standardize the epi
bfi=scale(bfi,scale=T)   #standardize the bfi

model5=lm(bdi~bfi)                 #model beck depression by the Big 5 inventory
summary(model5)
## 
## Call:
## lm(formula = bdi ~ bfi)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.2635  -3.1809  -0.6355   2.1409  19.4566 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.7792     0.3239  20.933  < 2e-16 ***
## bfibfext     -0.3519     0.3929  -0.896   0.3713    
## bfibfneur     3.0576     0.3459   8.840 2.81e-16 ***
## bfibfagree    0.2932     0.4094   0.716   0.4746    
## bfibfcon     -0.8968     0.3680  -2.437   0.0156 *  
## bfibfopen    -1.0197     0.4013  -2.541   0.0117 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.922 on 225 degrees of freedom
## Multiple R-squared:  0.2894, Adjusted R-squared:  0.2736 
## F-statistic: 18.33 on 5 and 225 DF,  p-value: 2.975e-15
model6=lm(bdi~bfi+epi)          
summary(model6)
## 
## Call:
## lm(formula = bdi ~ bfi + epi)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.7001  -2.7939  -0.6687   2.2363  16.3576 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.77922    0.30055  22.556  < 2e-16 ***
## bfibfext     0.32262    0.45645   0.707  0.48044    
## bfibfneur    1.35905    0.41935   3.241  0.00138 ** 
## bfibfagree   0.06383    0.38805   0.164  0.86950    
## bfibfcon    -0.66832    0.37153  -1.799  0.07341 .  
## bfibfopen   -1.00298    0.37404  -2.681  0.00788 ** 
## epiepiS      0.09068    0.39339   0.231  0.81791    
## epiepiImp   -0.62218    0.36787  -1.691  0.09219 .  
## epiepilie   -0.26893    0.33323  -0.807  0.42050    
## epiepiNeur   2.45393    0.41509   5.912 1.27e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.568 on 221 degrees of freedom
## Multiple R-squared:  0.3989, Adjusted R-squared:  0.3744 
## F-statistic:  16.3 on 9 and 221 DF,  p-value: < 2.2e-16
model7 = lm(bdi~bfi*epi)          
    #additive model of epi and bfi as well as the interactions between the sets
summary(model7)                   #given as an example of overfitting
## 
## Call:
## lm(formula = bdi ~ bfi * epi)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.0191  -2.6525  -0.5497   1.9093  14.9430 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            6.279425   0.423041  14.844  < 2e-16 ***
## bfibfext               0.206300   0.493871   0.418   0.6766    
## bfibfneur              1.158978   0.451906   2.565   0.0111 *  
## bfibfagree             0.002001   0.417169   0.005   0.9962    
## bfibfcon              -0.675954   0.393181  -1.719   0.0871 .  
## bfibfopen             -0.685177   0.416177  -1.646   0.1013    
## epiepiS                0.235251   0.430179   0.547   0.5851    
## epiepiImp             -0.593336   0.396046  -1.498   0.1357    
## epiepilie             -0.228319   0.365699  -0.624   0.5331    
## epiepiNeur             2.655196   0.432756   6.136 4.44e-09 ***
## bfibfext:epiepiS       0.236116   0.458918   0.515   0.6075    
## bfibfneur:epiepiS      0.319599   0.403803   0.791   0.4296    
## bfibfagree:epiepiS    -0.263338   0.484497  -0.544   0.5874    
## bfibfcon:epiepiS      -0.055400   0.413468  -0.134   0.8935    
## bfibfopen:epiepiS      0.144438   0.496949   0.291   0.7716    
## bfibfext:epiepiImp    -0.065013   0.463085  -0.140   0.8885    
## bfibfneur:epiepiImp   -0.150381   0.393607  -0.382   0.7028    
## bfibfagree:epiepiImp   0.335234   0.478316   0.701   0.4842    
## bfibfcon:epiepiImp     0.021982   0.394558   0.056   0.9556    
## bfibfopen:epiepiImp    0.355919   0.539411   0.660   0.5101    
## bfibfext:epiepilie    -0.053018   0.424651  -0.125   0.9008    
## bfibfneur:epiepilie   -0.088985   0.420050  -0.212   0.8324    
## bfibfagree:epiepilie   0.474549   0.474784   1.000   0.3188    
## bfibfcon:epiepilie     0.026155   0.382927   0.068   0.9456    
## bfibfopen:epiepilie   -0.086944   0.463418  -0.188   0.8514    
## bfibfext:epiepiNeur    0.700558   0.460097   1.523   0.1294    
## bfibfneur:epiepiNeur   0.579644   0.369567   1.568   0.1184    
## bfibfagree:epiepiNeur -0.475656   0.442734  -1.074   0.2839    
## bfibfcon:epiepiNeur   -0.070141   0.460483  -0.152   0.8791    
## bfibfopen:epiepiNeur  -0.213680   0.417856  -0.511   0.6097    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.607 on 201 degrees of freedom
## Multiple R-squared:  0.444,  Adjusted R-squared:  0.3638 
## F-statistic: 5.534 on 29 and 201 DF,  p-value: 5.879e-14
 ## At end of plotting, reset to previous settings:
     par(op)