0) Import helpful libraries/functions

# functions: dimx(), info(), LS(), look(), source()
setwd("C:/Users/Dani Grant/Dropbox/graduate school records/fall 2021/R programming/final")
source("http://www.matthewckeller.com/R.Class/KellerScript2.R") 
library(psych)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.3     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x ggplot2::%+%()   masks psych::%+%()
## x ggplot2::alpha() masks psych::alpha()
## x dplyr::filter()  masks stats::filter()
## x dplyr::lag()     masks stats::lag()
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:psych':
## 
##     logit

1) FUN WITH FUNCTIONS AND DATA MANIPULATION


a)

Write a function called “insert” which inserts a column or row (depending on an argument from the user) into a data.frame or a matrix at some specified position (also an argument from the user). E.g., if I did this:

newdat <- insert(dataset = x, inserted.vector = z, pos = 5, col = TRUE)

I would want the function to insert the vector z into the 5th column of x, shifting all the other columns over to the right. For extra credit: when col=TRUE, ensure that the name of the vector (“z” in this case) is also the name of the column in the returned dataset. E.g., in the above example, the 5th column of newdat would be “z” rather than “inserted.vector”. For a hint for doing this extra credit, google, “In R, how to get an object’s name after it is sent to a function?”
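
As a side note on the extra-credit hint, the trick it points to is deparse(substitute()); here is a minimal sketch (whoami and z are just made-up names for illustration):

whoami <- function(x) deparse(substitute(x)) #recovers the name of the object the caller passed in
z <- 1:10
whoami(z) #returns "z"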

insert <- function(inserted.vector, d, pos = 1, col = TRUE){
    
  if (pos <= 0) stop("can't insert a column or row at position 0 or below")
  if (pos %% 1 != 0) stop("Your pos value was not an integer")
  
  ############## DATA FRAME ################
  if(class(d)[1] == "data.frame"){ 
    
    if(col == TRUE){
      
      if (length(names(d)) < pos) stop("pos arg = more than cols in df")

      d[, (ncol(d)+1)] <- inserted.vector #place column at end of data.frame
      currentPos <- length(names(d)) 
    if (pos == 1) {
      colOrder <- c(currentPos, 1:(currentPos - 1)) #if pos =1 
      } else if(pos == currentPos) {
        colOrder <- 1:currentPos # If pos = ncol(data)
        } else { 
          firstHalf <- 1:(pos - 1) # first half of the columns, those before pos
          secondHalf <- pos:(currentPos - 1) # second half, the columns from pos onward
          colOrder <- c(firstHalf, currentPos, secondHalf)} # put the new column order together
      d <- subset(d, select = colOrder)  # Reordering the data
      names(d)[pos] <- c(deparse(substitute(inserted.vector)))
      return(d)
      }
    
    else{
      
      if (nrow(d) < pos) stop("pos arg = more than rows in df")
      
      d <- rbind.data.frame(d, inserted.vector)  #place row at end of data.frame
      currentPos <- nrow(d) 
  
    if(pos == 1){
      d <- rbind.data.frame(d[currentPos,], d[1:(currentPos - 1),]) #if pos =1 
    }else if (pos == currentPos){
      d <- rbind.data.frame(d[1:(currentPos - 1),], d[currentPos,]) # If pos = nrow(data)
    }else{ 
      moveRow <- d[currentPos,]
      firstHalf <- rbind.data.frame(d[1:(pos - 1),]) # rows that come before pos
      secondHalf <- rbind.data.frame(d[pos:(currentPos - 1),]) # rows from pos onward
      d <- rbind.data.frame(firstHalf, moveRow, secondHalf) # Putting that order together
    }
  
  return(d)
      }
    
    }else if(class(d)[1] == "matrix"){ #if it's a matrix
    
    if(col == TRUE){
      
      if (ncol(d) < pos) stop("pos arg = more than cols in matrix")

      d <- cbind(d,inserted.vector)  #place column at end of the matrix
      currentPos <- ncol(d) #get total columns
      if (pos == 1){
        colOrder <- c(currentPos, 1:(currentPos - 1)) #if pos=1 
        }else if(pos == currentPos) {
          colOrder <- 1:currentPos # If pos = ncol(matrix)
          }else { 
            firstHalf <- 1:(pos - 1) # first half of the columns, those before pos
            secondHalf <- pos:(currentPos - 1) # second half, the columns from pos onward
            colOrder <- c(firstHalf, currentPos, secondHalf) # put the new column order together
            }
      d <- subset(d, select = colOrder)  # Reordering the data
      colnames(d)[pos] <- c(deparse(substitute(inserted.vector)))

      return(d)
    
    }else{
      
      if (nrow(d) < pos) stop("pos arg = more than rows in matrix")

      d <- rbind(d, inserted.vector)  #place row at end of the matrix
      currentPos <- nrow(d) #get total rows
    
    if (pos == 1){
      d <- rbind(d[currentPos,], d[1:(currentPos - 1),]) #if pos=1 
      }else if (pos == currentPos) {
        d <- rbind(d[1:(currentPos - 1),], d[currentPos,]) # If pos = nrow(data)
        }else{ 
          moveRow <- d[currentPos,]
          firstHalf <- rbind(d[1:(pos - 1),]) # rows that come before pos
          secondHalf <- rbind(d[pos:(currentPos - 1),]) # rows from pos onward
          d <- rbind(firstHalf, moveRow, secondHalf) # Putting that order together
          }
      return(d)
      }
    }else{
      stop("data must be a matrix or dataframe")
    }
  }

b)

Write a function called “groups” that counts the number of occurrences of each unique element of some factor (x) and puts those numbers in a vector of the same length as x. For example: x <- c(1,2,2,2,4,5,2,3,1) –> groups(x) should return: 2 4 4 4 1 1 4 1 2. The function should work for numeric, factor, or character classes. Do not use a loop to accomplish this, as that is too slow for large datasets. This is a hard one. Hint: try (z <- table(x)) (zz <- as.data.frame(z))
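
For what it's worth, a sketch of the table()-based approach that hint points to (the solution below uses ave() instead; the table's names let you look each element's count back up):

groups.tab <- function(x){
  z <- table(x) #counts, named by unique value
  as.numeric(z[as.character(x)]) #look up each element's count by name
}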

groups <- function(v){
  
  if (inherits(v, 'data.frame')) stop("v argument must be a vector!")
  if (inherits(v, 'matrix')) stop("v argument must be a vector!")

  if(class(v) == "factor"){
    v <- as.character(v)
    count <- ave(v, v, FUN = length)
  }
  else{
    count <- ave(v, v, FUN = length)
  }
  return(as.numeric(count))
}
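
A quick check against the example given in the prompt:

x <- c(1,2,2,2,4,5,2,3,1)
groups(x) #should return 2 4 4 4 1 1 4 1 2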

c)

Use your function insert() to insert new.dat into the 50th row of Fam.data. Call this new dataset “Fam.data1”

Fam.data1 <- insert(inserted.vector = new.dat, d = Fam.data, pos = 50, col = FALSE)
new.dat == Fam.data1[50,]
##     FamID PersID   ht   wt   ex
## 872  TRUE   TRUE TRUE TRUE TRUE

d)

merge Fam.data1 and New.Fam.Data. Call this new dataset “Fam.data2”. Include all data from both datasets. If an individual is missing in one dataset, make their value “NA” in the merged data. Make sure to output the number of rows of Fam.data2 into the rendered file.

#merge all data & if indiv missing in 1 dataset = NA in merged data
Fam.data2 = merge(Fam.data1, New.Fam.Data, 
                            by = c('FamID', 'PersID'), 
                            all = T)

#include row number into rendered file
subject <- 1:nrow(Fam.data2)
Fam.data2 <- insert(subject, Fam.data2, pos = 1, col = T)

head(Fam.data2)
##   subject FamID PersID       ht        wt ex        cne
## 1       1  A162 G13235 53.99334 157.52228  2 -0.5964615
## 2       2  A169 L55321 65.91020 237.74834  3  1.5389759
## 3       3  A169 T29481 60.85398 173.99635  2 -0.9603961
## 4       4  A353 A24998 51.27930 155.18526  4 -0.1690118
## 5       5  A353 A76680 63.65348 172.20203  0  0.5563925
## 6       6  A353 B39700 46.13513  83.73244  1 -1.5282779
length(Fam.data2$subject)
## [1] 2010

e)

Use your function groups() to create a vector to find out how many people are in each family in Fam.data2 (Famid is the ID for families). Record how long it took your function groups() to accomplish this task

count <- groups(Fam.data2$FamID)
(time.group <- system.time(groups(Fam.data2$FamID)))
##    user  system elapsed 
##    0.01    0.00    0.02
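
As a minor variation, system.time() can wrap the assignment itself, so groups() only has to run once:

time.group <- system.time(count <- groups(Fam.data2$FamID)) #times the call and stores the result in one step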

f)

Use your function insert() to insert this vector (the number of individuals per family you just created above) into the third column of Fam.data2. Call this new dataset Fam.data3

head(Fam.data3 <- insert(count, Fam.data2, pos = 3, col = T))
##   subject FamID count PersID       ht        wt ex        cne
## 1       1  A162     1 G13235 53.99334 157.52228  2 -0.5964615
## 2       2  A169     2 L55321 65.91020 237.74834  3  1.5389759
## 3       3  A169     2 T29481 60.85398 173.99635  2 -0.9603961
## 4       4  A353     7 A24998 51.27930 155.18526  4 -0.1690118
## 5       5  A353     7 A76680 63.65348 172.20203  0  0.5563925
## 6       6  A353     7 B39700 46.13513  83.73244  1 -1.5282779

g)

Sort the Fam.data3 so that families are together, and such that the largest families are on top. Call this new dataset Fam.data4

Fam.data4 <- Fam.data3[order(-Fam.data3$count, Fam.data3$FamID),] #largest families on top, families kept together by FamID

h)

(Challenging) If a family has 6 members, remove 1 family member and if they have 7 remove 2 (i.e., remove that row). Make sure to selectively remove those who have missing data in the cne variable. Otherwise, remove them at random. Call this new data.frame “Fam.data.reduced”. Make sure NOT to hard code this - your code should work if we recreated Fam.data above with a new seed, such that the numbers in each family, missingness, etc. were different (although you can assume that the number per family is always <= 7). Finally, check your answer by showing output in the rendered file of summary(as.factor(groups(Fam.data.reduced$FamID)))

Hint: there are a million ways to do this. One might be to create datasets of 5 or fewer family members, 6 family members, and another of 7 family members. Then work on the latter two and then rbind them back together at the end. For the 7 family member one, you can randomize order within family by something like: rand <- runif(n) ord <- order(famid,rand) Also, if you put NA’s in rand wherever they exist in cne, those will always come at the end of each family. You can also do this by looping through famid’s, and for each family, deciding which ones to drop if they have 6 or 7 members.
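
A tiny illustration of that ordering hint, with made-up values: putting NA in rand wherever cne is missing makes those members sort to the end of their own family.

famid <- c("F1","F1","F1","F2","F2")
rand  <- c(0.70, NA, 0.20, NA, 0.40) #NA wherever cne is missing
order(famid, rand) #3 1 2 5 4 -- the NA member comes last within each family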

#subset data set with 3 new = 5 or fewer, 6, and 7 family members. 
fam.data.less5 <- Fam.data4[Fam.data4$count <= 5,]
fam.data.6 <- Fam.data4[Fam.data4$count == 6,]
fam.data.7 <- Fam.data4[Fam.data4$count == 7,]

#for families of 6, remove 1
fam.data.6 <- fam.data.6[order(fam.data.6$count, decreasing = FALSE),]
rand <- runif(nrow(fam.data.6)) #random tiebreaker for each row
rand[is.na(fam.data.6$cne)] <- NA #NA where cne is missing, so those members sort last within their family
fam.data.6 <- fam.data.6[order(fam.data.6$FamID, rand),] #families together, random order within, missing-cne members last
fam.data.6$delete <- c(T,T,T,T,T,F) #keep the first 5 of each family; the dropped member has missing cne if present, otherwise is random
fam.data.6 <- fam.data.6[fam.data.6$delete != F,]
fam.data.6$count <- 5
fam.data.6$delete <- NULL

#for families of 7, remove 2
fam.data.7 <- fam.data.7[order(fam.data.7$count, decreasing = FALSE),]
rand <- runif(nrow(fam.data.7)) #random tiebreaker for each row
rand[is.na(fam.data.7$cne)] <- NA #NA where cne is missing, so those members sort last within their family
fam.data.7 <- fam.data.7[order(fam.data.7$FamID, rand),] #families together, random order within, missing-cne members last
fam.data.7$delete <- c(T,T,T,T,T,F,F) #keep the first 5 of each family of 7
fam.data.7 <- fam.data.7[fam.data.7$delete != F,]
fam.data.7$count <- 5
fam.data.7$delete <- NULL

Fam.data.reduced <- rbind.data.frame(fam.data.7, fam.data.6, fam.data.less5)

summary(as.factor(groups(Fam.data.reduced$FamID)))
##    1    2    3    4    5 
##   76  146  186  268 1110

2) MULTI-LEVEL ANALYSES IN R

Q: Does keeping mentally active and engaging in challenging activities help stave off cognitive decline among persons of retirement age?

N ~ 300+ people, ages 65-75; longitudinal: 50 months

Group A: told not to do any particular activities. Group B: randomly assigned to play the computer game “Fortnite” for 6 hrs/week. Group C: assigned to play card games with friends for 6 hours per week.

At the beginning of each month, participants log into the study website and take an Executive Function (EF) test. The raw data are 302 flat files in the folder “FinalData”; each file holds the 50 EF measures (w1-w50) for one participant and is named “X.[subject number].[group]”.

Your job as a researcher is to understand the effects of the 3 conditions on cognitive decline: i) does either intervention stave off cognitive decline? ii) do these effects depend on gender (e.g., does playing Fortnite help males more than females?) iii) do people with more friends have higher or lower EF at start? iv) does number of friends have an effect on cognitive decline?

You might notice that this is a very typical “hierarchical” model, something we did not cover in this course. I’m going to ask you to analyze this anyway. I’ll help you through. The way we’ll do it is NOT necessarily the best way to do it, which would involve using HLM (e.g., using the lme4 package). Nevertheless, we’ll go through a very intuitive way to think about what these analyses are all about, and I suggest that this intuitive method we use below is the first way you should always analyze HLM data, before doing it ‘properly’ using an HLM routine, because the two answers should roughly agree, and if not, then you’ve probably done it the ‘proper’ way improperly!
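
For comparison, the ‘proper’ HLM version of these analyses would look roughly like the sketch below; it is not run here and assumes a long-format data.frame named long with columns EF, time, group, and id, which we never actually build in this document.

library(lme4)
#random intercept and slope per subject; fixed effects of time, group, and their interaction
fit <- lmer(EF ~ time * group + (1 + time | id), data = long)
summary(fit)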

a)

Unzip the FinalData.zip file to create a “FinalData” folder. In the “FinalData” folder, there are 300+ files. Pull each one into R and merge the data (by subject ID) with the “SubjectInfo.txt” data file, which is also in the folder (note that there are fewer files than rows in SubjectInfo.txt - some subjects withdrew from the study at some point). The variable “friends” in SubjectInfo.txt denotes “number of close friends who you can confide in and who you talk to at least monthly”. If subject data is not available in the individual EF files, throw out those subjects

Hint: try these lines of code first: files <- list.files() files <- files[-1] subinfo <- read.table(“SubjectInfo.txt”,header=TRUE) Now just loop through “files”, reading each file into R using read.table, and storing each of those as a row in a matrix that you start as an empty matrix and then fill up. I.e., I would set this up as a “wide” formatted data, with each row a person and each column a measurement of that person

subInfo <- read.table("C:/Users/Dani Grant/Dropbox/graduate school records/fall 2021/R programming/FinalData/SubjectInfo.txt", header = T)

path <- "C:/Users/Dani Grant/Dropbox/graduate school records/fall 2021/R programming/FinalData/"

files <- list.files(path)
files <- files[-1] #drop the first file (SubjectInfo.txt), keeping only the individual EF files
files <- data.frame(files)

f <- files %>% separate(files, c("X", "subject", "group"), "[.]") #split the "X.[subject].[group]" filenames into their parts

s <- cbind.data.frame(rep(NA, 50)) #placeholder data.frame with 50 rows (one per monthly EF measure)

for (i in 1:nrow(files)){ #read each of the 302 subject files into a column of s
  subData <- read.table(paste(path, files[i,], sep = ""), header = F)
  s[,i] <- subData
  colnames(s)[i] <- paste("SUB", f$subject[i], sep = "")
}

s_wide <- s #save subjects by column

#transpose s to make subjects by row
s <- as.data.frame(t(as.matrix(s)))
id <- f$subject
s <- insert(id, s, pos = 1, col = T)
s$id <- paste("SUB", s$id, sep = "")

#merge by subject ID with SubjectInfo
s <- merge(subInfo, s, by = "id")

b)

perform a regression for each individual subject, where you regress the 50 EF scores on time (time <- 1:50). Save the intercept and slope for each subject in a matrix called “regdat”. These will be our dependent (y) variables later. However, you don’t want to save ‘bad’ regression data.

Individual EF scores ~ time; intercepts and slopes saved in the matrix regdat.

Thus, you want to make any data point that has a high Cook’s D (cookd > .3) be missing. After that, you should then rerun the regression analysis, saving the intercept and slope from after the highly influential points have been dropped. Finally, if any subject is missing 6 or more EF scores in total (including those you’ve dropped), that subject’s regression data (intercept & slope) should be set to “NA”. You’ll need to automate all this in a loop.

Collect each model’s Cook’s D; any data point with a value above .3 gets set to NA.

i) A few hints: see the function cooks.distance() (it lives in the stats package, which is loaded by default)

ii) you’ll probably be doing lm() on each row of a dataframe. If so, you’ll need to convert your row to a class of object that R can do lm() on. Also, you can’t do a lm(y~1:50) for example. You’ll first need to make 1:50 a vector outside of the lm()

iii) it will be a challenge to figure out how to make the right point(s) missing that has/have a high cookd, given the missingness in the data. There are probably many ways to do this. I did it by making a vector of the cook’s d values, and using the names of each element (which is consistent, regardless of missingness) to find the right element to set to NA, rather than the element itself (which is off if there is missingness before that point).
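
To make hint iii concrete: cooks.distance() keeps the original row positions as names even after NA rows are dropped, so you can index by name rather than by position. A sketch, where y and model are hypothetical stand-ins for one subject's EF vector and its lm fit:

cd <- cooks.distance(model) #named "1","3","4",... if row 2 was missing
bad <- as.numeric(names(cd)[cd > 0.3]) #original positions of the influential points
y[bad] <- NA #set exactly those positions to missing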

#regdat: one row per subject, holding that subject's intercept and slope
regdat <- matrix(NA, nrow = ncol(s_wide), ncol = 2, byrow = T)
colnames(regdat) <- c("intercept", "slope") #I tried dimnames = list(rownames, colnames), but it wouldn't knit to html even though it runs fine in the Rmd

#regression for each subject: lm(50 EF scores ~ time)
time <- 1:50

for(i in 1:ncol(s_wide)){
  model <- lm(s_wide[[i]] ~ time)
  regdat[i,1] <- model$coefficients[1] #intercept
  regdat[i,2] <- model$coefficients[2] #slope
  
  cookd <- cooks.distance(model) #named by the original row positions, so indexing stays correct despite missingness
  bad <- as.numeric(names(cookd)[cookd > 0.3]) #original positions of highly influential points (cookd > .3)
  s_wide[bad, i] <- NA #discard those points
}



#rerun regression and re-save; if a subject is missing 6+ EF scores in total (including the dropped points), set that subject's regression data to NA
regdat.trim <- matrix(NA, nrow = ncol(s_wide), ncol = 2, byrow = TRUE)
colnames(regdat.trim) <- c("intercept", "slope")

for(i in 1:ncol(s_wide)){
  model <- lm(s_wide[[i]] ~ time)
  
  regdat.trim[i,1] <- model$coefficients[1] #intercept
  regdat.trim[i,2] <- model$coefficients[2] #slope
  
  if (sum(is.na(s_wide[[i]])) >= 6) regdat.trim[i,] <- NA #6 or more missing scores -> drop this subject's intercept and slope
  }

# adding new columns for future question
group <- f$group
slope <- regdat.trim[,2]
intercept <- regdat.trim[,1]

s <- insert(group, s, pos = 5, col = T)
s <- insert(intercept, s, pos = 6, col = T)
s <- insert(slope, s, pos = 7, col = T)

iv) for me to check your answer on this part, make sure to print out the summary(regdat) (along with your R code) in your rendered file (if you are using R Markdown).

summary(regdat) #original regdat without cookd exclusions
##    intercept          slope           
##  Min.   : 94.44   Min.   :-0.4220441  
##  1st Qu.:101.92   1st Qu.:-0.0759646  
##  Median :104.70   Median : 0.0012761  
##  Mean   :104.98   Mean   :-0.0000377  
##  3rd Qu.:108.12   3rd Qu.: 0.0736081  
##  Max.   :118.60   Max.   : 0.3238296
summary(regdat.trim)#with cookd exclusions & NAs for people >= 6 NAs
##    intercept          slope           
##  Min.   : 94.44   Min.   :-0.4220441  
##  1st Qu.:101.92   1st Qu.:-0.0729087  
##  Median :104.70   Median : 0.0012761  
##  Mean   :104.96   Mean   : 0.0005028  
##  3rd Qu.:108.10   3rd Qu.: 0.0715691  
##  Max.   :118.60   Max.   : 0.3238296

c)

Before we get to analyzing the slopes (cognitive incline/decline) and intercepts (starting points), we’d like to visualize a scatterplot for each individual. Create a PDF with as many pages as # of subjects. Each page should be a scatterplot of that subject’s time vs. EF. Also:

i) make the points blue if male and red if subject is female

ii) make the title of each graph be the subject’s ID number followed by whether that subject is group A, B, or C

iii) place the best fit line in each graph, in the red or blue color

iv) place the text “B0 = xx” and “B1 = xx” in each graph, where xx is the correct intercept or slope. Try to place this in the same location (always in the upper left) of each graph. To do this, try using the vector that comes from lim <- par('usr'), which you should run after a plot has been produced (see the snippet after this list)

v) Make sure to plot each subject, even if they’re missing in the data above.

vi) If using R Markdown, include the R syntax for how to do this in the rendered file, and place only the FIRST of these plots in your rendered file, as an example. DO NOT place all ~300 plots in the rendered file! In either case, your R code should be able to create a single 300-page PDF.
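
The vector that hint iv refers to is what par('usr') returns right after a plot is drawn - the x and y limits of the plot region - which can be used to anchor text in the same spot on every page. A sketch (model here is whatever lm fit was made for that subject); the solution below uses mtext() instead, which achieves the same thing:

lim <- par('usr') #c(x1, x2, y1, y2) of the current plot region
text(x = lim[1] + .05*(lim[2] - lim[1]),
     y = lim[4] - .05*(lim[4] - lim[3]),
     labels = paste0("B0 = ", round(coef(model)[1], 2),
                     "\nB1 = ", round(coef(model)[2], 2)),
     adj = c(0, 1)) #upper-left corner, left- and top-justified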

destination = "C:/Users/Dani Grant/Dropbox/graduate school records/fall 2021/R programming/myGraphs.pdf"

par(mfrow = c(1,1))
#lim <- par('usr')

pdf(file = destination)

for(i in 1:ncol(s_wide)){
  
  model <- lm(s_wide[[i]] ~ time)
  
  ptcol <- if(s$gender[i] == "M") "blue" else "red" #blue = male, red = female

  plot(s_wide[,i] ~ time, 
       ylab = "EF scores", 
       col = ptcol,
       main = paste(names(s_wide[i]), "group", s$group[i], sep = ".")) #ID & group in the title
  abline(model, col = ptcol) #best-fit line in the same color as the points
  lim <- par('usr') #plot-region coordinates, per the hint (not needed with mtext below)
  mtext(paste0("B0 = ", round(model$coefficients[1],2), 
               "\nB1 = ", round(model$coefficients[2],2)), outer = F, side = 3, line = -3, adj = .04) 
}

dev.off()
## png 
##   2

d)

now analyze the main data using multiple regression. Each subject’s slope will be a dependent variable, giving us info on cognitive change. Each subject’s intercept will be another dependent variable in a separate regression model, giving us info on where each person started before the intervention. Answer the questions:

s$MvF <- ifelse(s$gender == "M", -.5, .5)

s$friends.c <- s$friends - mean(s$friends, na.rm = T)
#s$friends.1sd <- s$friends.c - sd(s$friends, na.rm = T)

s$age.c <- s$age - mean(s$age, na.rm = T)

### dummy codes
s$B_1[s$group == 'A'] <- 0 # A = no activity
s$B_1[s$group == 'B'] <- 1 # B = Fortnite 6 hr/week
s$B_1[s$group == 'C'] <- 0 # C = games with friends 6 hr/week

s$C_1[s$group == 'A'] <- 0 
s$C_1[s$group == 'B'] <- 0
s$C_1[s$group == 'C'] <- 1

s$A_1[s$group == 'A'] <- 1 
s$A_1[s$group == 'B'] <- 0
s$A_1[s$group == 'C'] <- 0

### contrast codes A v BC ############# I'm better at interpreting these contrast codes, so I am using them to check my work
s$AvBC[s$group == 'A'] <- -1 
s$AvBC[s$group == 'B'] <- .5
s$AvBC[s$group == 'C'] <- .5

s$BvC[s$group == 'A'] <- 0 
s$BvC[s$group == 'B'] <- -.5
s$BvC[s$group == 'C'] <- .5

i) does either intervention stave off cognitive decline? Please test the contrasts of A vs. B and A vs. C (dummy codes), and control for gender and number of friends

#intercept
(modelA1int <- summary(lm(intercept ~ B_1 + C_1 + MvF + age.c + friends.c, data = s)))
## 
## Call:
## lm(formula = intercept ~ B_1 + C_1 + MvF + age.c + friends.c, 
##     data = s)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.9580 -2.6560  0.0097  2.7203 14.8264 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 105.22733    0.40218 261.644  < 2e-16 ***
## B_1          -0.84313    0.60158  -1.402 0.162107    
## C_1          -0.20655    0.56970  -0.363 0.717191    
## MvF           1.78817    0.48583   3.681 0.000276 ***
## age.c        -0.10409    0.07343  -1.418 0.157356    
## friends.c     0.50551    0.10017   5.046 7.87e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.174 on 296 degrees of freedom
## Multiple R-squared:  0.136,  Adjusted R-squared:  0.1214 
## F-statistic: 9.317 on 5 and 296 DF,  p-value: 2.956e-08
#slope
(modelA1slope <- summary(lm(slope ~ B_1 + C_1 + MvF + age.c + friends.c, data = s)))
## 
## Call:
## lm(formula = slope ~ B_1 + C_1 + MvF + age.c + friends.c, data = s)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32459 -0.05856 -0.00247  0.06361  0.37153 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.138e-02  1.022e-02  -5.030 8.54e-07 ***
## B_1          6.413e-02  1.528e-02   4.197 3.58e-05 ***
## C_1          9.518e-02  1.447e-02   6.577 2.17e-10 ***
## MvF         -1.057e-02  1.234e-02  -0.856    0.392    
## age.c       -8.078e-03  1.865e-03  -4.331 2.03e-05 ***
## friends.c   -4.663e-05  2.544e-03  -0.018    0.985    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.106 on 296 degrees of freedom
## Multiple R-squared:  0.1856, Adjusted R-squared:  0.1719 
## F-statistic: 13.49 on 5 and 296 DF,  p-value: 7.253e-12

ANSWER

Yes, both interventions stave off cognitive decline relative to the no-activity group (Fortnite: b = .064, p < .001; card games: b = .095, p < .001), and playing cards with friends staves it off better than playing Fortnite.


ii) expand your multiple regression equation above to answer whether these effects depend on gender (e.g., does playing Fortnite help males more than females?)

(modelA2int <- summary(lm(intercept ~ (B_1 + C_1) * MvF + age.c + friends.c, data = s)))
## 
## Call:
## lm(formula = intercept ~ (B_1 + C_1) * MvF + age.c + friends.c, 
##     data = s)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.1586 -2.9034  0.0888  2.6371 14.3772 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 105.25884    0.40249 261.518  < 2e-16 ***
## B_1          -0.90667    0.60231  -1.505    0.133    
## C_1          -0.23762    0.57015  -0.417    0.677    
## MvF           0.96352    0.80775   1.193    0.234    
## age.c        -0.10969    0.07364  -1.489    0.137    
## friends.c     0.50035    0.10023   4.992 1.03e-06 ***
## B_1:MvF       1.93327    1.20567   1.603    0.110    
## C_1:MvF       0.77003    1.14149   0.675    0.500    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.17 on 294 degrees of freedom
## Multiple R-squared:  0.1435, Adjusted R-squared:  0.1231 
## F-statistic: 7.036 on 7 and 294 DF,  p-value: 9.071e-08
(modelA2slope <- summary(lm(slope ~ (B_1 + C_1) * MvF + age.c + friends.c, data = s)))
## 
## Call:
## lm(formula = slope ~ (B_1 + C_1) * MvF + age.c + friends.c, data = s)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33551 -0.05906 -0.00073  0.05676  0.36190 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.0521972  0.0101883  -5.123 5.45e-07 ***
## B_1          0.0661406  0.0152463   4.338 1.98e-05 ***
## C_1          0.0957625  0.0144324   6.635 1.56e-10 ***
## MvF          0.0102473  0.0204467   0.501   0.6166    
## age.c       -0.0078115  0.0018642  -4.190 3.69e-05 ***
## friends.c    0.0001935  0.0025373   0.076   0.9393    
## B_1:MvF     -0.0619730  0.0305192  -2.031   0.0432 *  
## C_1:MvF     -0.0090939  0.0288948  -0.315   0.7532    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1055 on 294 degrees of freedom
## Multiple R-squared:  0.1983, Adjusted R-squared:  0.1792 
## F-statistic: 10.39 on 7 and 294 DF,  p-value: 1.185e-11

ANSWER

In the Fortnite condition, the effect on slope depends on gender, b = -.062, t(294) = -2.03, p = .043: males benefit more from this intervention than females. There is no gender difference for the playing-cards intervention, b = -.009, t(294) = -0.32, p = .753.


iii) controlling for condition and gender, do people with more friends have higher or lower EF at the start (i.e., are the intercepts associated with number of friends)?

(modelA3int <- summary(lm(intercept ~ AvBC + BvC + MvF + age.c + friends.c, data = s)))
## 
## Call:
## lm(formula = intercept ~ AvBC + BvC + MvF + age.c + friends.c, 
##     data = s)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.9580 -2.6560  0.0097  2.7203 14.8264 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 104.87743    0.24180 433.730  < 2e-16 ***
## AvBC         -0.34989    0.33496  -1.045 0.297070    
## BvC           0.63657    0.60260   1.056 0.291660    
## MvF           1.78817    0.48583   3.681 0.000276 ***
## age.c        -0.10409    0.07343  -1.418 0.157356    
## friends.c     0.50551    0.10017   5.046 7.87e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.174 on 296 degrees of freedom
## Multiple R-squared:  0.136,  Adjusted R-squared:  0.1214 
## F-statistic: 9.317 on 5 and 296 DF,  p-value: 2.956e-08

ANSWER

Yes, controlling for condition, gender, and age, people with more friends start off with higher EF, b = .51, t(296) = 5.05, p < .0001.


iv) controlling for condition and gender, does number of friends have an effect on on cognitive decline (i.e., are slopes associated with number of friends)?. Make sure you use R to graphically check your data and assumptions first, and show me your work. The final stage should be your inferential tests. What are your conclusions?

#run slope model
model.slope <- lm(slope ~ B_1 + C_1 + friends.c + age.c + MvF, data = s)

#run intercept model
model.int <- lm(intercept ~ B_1 + C_1 + friends.c + age.c + MvF, data = s)

# plot data of slopes and friends
plot(s$slope ~ s$friends);abline(lm(s$slope ~ s$friends)) #not much association

#correlation between friends and slopes
cor(s$friends, s$slope, use="complete.obs") #basically no corr
## [1] -0.01776989
#look at variance of groups
tapply(s$friends, s$group, var, na.rm = T) #hm...much more variance in group c
##        A        B        C 
## 5.916580 4.910986 6.667607
###############
# nonlinearity
###############

# component + residual plot
crPlots(model.slope)

crPlots(model.int) #friends seem non-linear for intercept model

#check also for non-linearity. Might be good reason to transform predictor variables. 

####################
# Normality
####################

hist(s$friends, main = "regular friends") #lots of positive skew!

s$friends.log <- log(s$friends + 1) #log(x + 1), since some people report 0 friends; more an interpretation issue than an assumption issue

hist(s$friends.log, main = "log(friends)") #still skewed, but better

#normality of the outcome (in particular the residuals) is an assumption, though the least important one unless N < 30, so I'm not too worried here because the sample is big enough
qqPlot(s$slope, main = "QQ plot of slopes") #seems pretty normal

## [1] 163 282
qqPlot(s$intercept, main = "QQ plot of intercepts") #seems pretty normal

## [1] 192   8
#####################
# homoscedasticity
#####################

# non-constant error variance test
ncvTest(model.slope) #NS
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 1.177305, Df = 1, p = 0.27791
ncvTest(model.int)#NS
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.1473791, Df = 1, p = 0.70105
# plot studentized residuals vs. fitted values
spreadLevelPlot(model.slope) #looks ok; suggested power transformation is about 1.05
## Warning in spreadLevelPlot.lm(model.slope): 
## 145 negative fitted values removed

## 
## Suggested power transformation:  1.048419
spreadLevelPlot(model.int) #suggest power transformatoin = 1.9

## 
## Suggested power transformation:  0.08484014
####################
# independence of error
###################

#can't know this, but no reason to think the independence of residuals is violated here. 

####################
# Influential observations
####################

outlierTest(model.slope) #nothing of concern
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##     rstudent unadjusted p-value Bonferroni p
## 188 3.604136         0.00036752      0.11099
outlierTest(model.int) #nothing of concern
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##     rstudent unadjusted p-value Bonferroni p
## 192 3.650122         0.00030995     0.093605
head(cooks.distance(model.slope)) #no large (> .3) cookD for slope
##            1            2            3            4            5            6 
## 0.0001832786 0.0083915011 0.0114460481 0.0038040042 0.0002909655 0.0002696978
head(cooks.distance(model.int)) #no large (> .3) cookD for intercept
##            1            2            3            4            5            6 
## 0.0062335001 0.0031072869 0.0057524462 0.0072449004 0.0007306838 0.0013934473
#which data points have large influence?
influencePlot(model.slope, #some have higher influence, but don't look concerning
              id.method = "identify", 
              main = "Influence Plot", 
              sub = "Circle size is proportial to Cook's Distance" )
## Warning in plot.window(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in box(...): "id.method" is not a graphical parameter
## Warning in title(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method" is not a
## graphical parameter

##       StudRes        Hat      CookD
## 119 -1.173946 0.05256924 0.01272843
## 163 -3.136846 0.01891262 0.03069723
## 188  3.604136 0.01625254 0.03437512
## 287 -1.378551 0.05741502 0.01923448
#leverage: a high-leverage point with a large cookd is an influential point; might consider rerunning without it to see if results change
leveragePlots(model.slope) #188 seems out there

leveragePlots(model.int) #192, 287, 43 too

#look at the residuals of the B0 and B1 models as a function of group A, B, or C
qqPlot(model.slope$residuals) # 188

## [1] 188 163
qqPlot(model.int$residuals) # 192 & 43

## [1] 192  43
qqPlot(model.slope$residuals[s$group == 'A'], main = "slopes for A") #188

## 188 163 
##  62  49
qqPlot(model.slope$residuals[s$group == 'B'], main = "slopes for B") #looks ok

## 275 216 
##  81  66
qqPlot(model.slope$residuals[s$group == 'C'], main = "slopes for C") #282, 268

## 282 268 
## 103  95
qqPlot(model.int$residuals[s$group == 'A'], main = "intercepts for A") #192

## 192  43 
##  63  14
qqPlot(model.int$residuals[s$group == 'B'], main = "intercepts for B") #looks ok

##   8 274 
##   2  80
qqPlot(model.int$residuals[s$group == 'C'], main = "intercepts for C") #looks ok

## 260  62 
##  93  26

ANSWER

It seems that number of friends sets each individual’s baseline (the intercept) but does not influence change over time. It might have been predictive if we had more than one measurement of “number of close friends who you can confide in and who you talk to at least monthly,” because friends may be confounded with the interventions (e.g., finding a good group of friends outside the intervention could partly cause the increase in EF in addition to the intervention itself).

Some diagnostics suggest we should log-transform friends, but when I tried that the model was singular, suggesting there isn’t enough variance in number of friends to support transforming that predictor.

Nonlinearity: friends looked non-linear in the intercept model, but the slope model was ok.

Normality: normality of the residuals is an assumption, though it is the least important one unless N < 30, so I’m not too worried given the sample size. The QQ plots also show that the outcomes (slope, intercept) look pretty normal.

Homoscedasticity: the non-constant variance tests were non-significant, and the plots of studentized residuals vs. fitted values suggest power transformations of about 1.05 (slope model) and 0.08 (intercept model), so nothing alarming here.

Independence of error: We can’t know this, but no reason to think the independence of residuals is violated here.

Influential observations: no outliers of concern and no large Cook’s distance values; there were some higher-leverage points (subjects 188, 192, 287, and 43), but I reran the models without them and the results did not change, so I’m leaving them in.


e)

show the empirical 95% CI for the effects of the interventions on cognitive decline, based on 1000 bootstrapped iterations.

set.seed(1)
bs = 1000

B.slope <- vector(length = bs)
C.slope <- vector(length = bs)

for (i in 1:bs){
  newdat <- s[sample(1:nrow(s), nrow(s), replace = T),]
  coefs <- coefficients(lm(slope ~ B_1 + C_1, data = newdat, na.action = na.exclude))[2:3]
  B.slope[i] <- coefs[1]
  C.slope[i] <- coefs[2]
}

B.slope <- sort(B.slope)
print(paste("Group B 95% CI: [", round(B.slope[[25]], 3), ",", round(B.slope[[976]],3),"]"), sep = "")
## [1] "Group B 95% CI: [ 0.033 , 0.099 ]"
C.slope <- sort(C.slope)
print(paste("Group C 95% CI: [", round(C.slope[[25]], 3), ",", round(C.slope[[976]],3),"]"), sep = "")
## [1] "Group C 95% CI: [ 0.069 , 0.124 ]"

f)

provide the empirical 2-tailed p-values for the two intervention effects using 1000 permutations. Note that this is an example of permuted multiple regressions, so you’ll need to get the p-values twice, once for each effect (a total of 2000 permutations), using permuted multiple regressions with the residual approach, as we discussed in the bootstrapping/permutation part of the class.

# 2-tailed p-values for group B & C. slopes w/ 1000 perm 
# get the p-values 2x (slope of B, slope for C) 
# total = 2000 permutations

#place NAs in the AvBC and BvC columns to match the missing data in the slopes column
for(i in 1:nrow(s)){
  if(rowSums(is.na(s[i,5:54])) > 5){ 
    s$AvBC[i] = NA
    s$BvC[i] = NA
  }
}

#in multiple regression, exchangeability is violated if the variable of interest (X_p) is simply permuted, because that destroys the inter-relationships among the X variables
#Solution: to test the partial slope of X_p, first residualize Y and X_p (controlling for the other variables), then permute resid(X_p) against resid(Y) as in simple regression

#run models of interest
m.b1 <- lm(slope ~ BvC + MvF + friends.c + age.c, data = s)
m.AvBC <- lm(AvBC ~ BvC + MvF + friends.c + age.c, data = s)
m.BvC <- lm(BvC ~ AvBC + MvF + friends.c + age.c, data = s)

#assign model residuals to new DFs
rm.b1 <- cbind(m.b1$residuals)
n1 <- length(rm.b1)

rm.AvBC <- cbind(m.AvBC$residuals)
n2 <- length(rm.AvBC)

rm.BvC <- cbind(m.BvC$residuals)
n3 <- length(rm.BvC)

#number of iterations
bs <- 1000

set.seed(1234)

#create new vectors for permutation of B and C groups
rm.vec.AvBC <- vector(length = bs)
rm.vec.BvC <- vector(length = bs)

#permutation
for (i in 1:bs){
  newdat <- data.frame(y = rm.b1[, 1], #outcome residuals stay in their original order
                       x = rm.AvBC[sample(1:n2, n2, replace = F), 1], #permute (sample without replacement)
                       z = rm.BvC[sample(1:n3, n3, replace = F), 1])
  rm.vec.AvBC[i] <- coefficients(lm(y ~ x, data = newdat))[2] #resid(b1) ~ permuted resid(AvBC)
  rm.vec.BvC[i] <- coefficients(lm(y ~ z, data = newdat))[2] #resid(b1) ~ permuted resid(BvC)
  }

#get the original model slope
coef <- coefficients(lm(slope ~ AvBC + BvC + MvF + age.c + friends.c, data = s))
c.BvC <- coef[3] #BvC
c.AvBC <- coef[2] #AvBC

#calculate p for AvBC
pAvBC <- sum(abs(rm.vec.AvBC) > abs(c.AvBC))/length(rm.vec.AvBC) #empirical two-tailed p under the null for AvBC

#calculate p for BvC
pBvC <- sum(abs(rm.vec.BvC) > abs(c.BvC))/length(rm.vec.BvC) #empirical two-tailed p under the null for BvC

print(paste("for AvBC, p = ", round(pAvBC, 3), sep = ""))
## [1] "for AvBC, p = 0"
print(paste("for BvC, p = ", round(pBvC, 3), sep = ""))
## [1] "for BvC, p = 0.08"

3) SIMULATION IN R

a)

After you publish your findings from the above analysis in #2 in a top journal in your field, you receive criticism from the twitterverse that you did not control for non-random dropout. In particular, a group of colleagues (reasonably) think that people with the most cognitive decline may have been more liable to drop out of the study, and that this may have been especially the case for people in group A (people with decline in groups B & C were doing more fun activities and so may have stayed in despite cognitive decline). In essence, critics are concerned about a dropout ~ condition x cognitive decline interaction, such that only those in group A dropped out if they had cognitive decline.


Modify the simulation syntax for the problem provided above in order to conduct a fake data simulation of a worst-case scenario, where all those who dropped out of group A were those who had the most severe expected cognitive decline. E.g., if 9 people in group A dropped out, then simulate it again such that the 9 with the most severe expected cognitive decline in group A drop out. Then present the main results of the slopes ~ condition + other variables to see what influence this would have even under the worst case. Note that in the simulation above, we simulated truly random dropout. You just need to change that assumption, create new data, and see what the effect is. This is an example of “sensitivity analysis” - seeing what our results would have looked like under a violation of an assumption.

HINT: concern yourself only with the expected cognitive decline (each individual’s true slope), not the actual observed cognitive decline (which includes noise). That’s because the expected cognitive decline is what you would find, on average, if you redid this 1000s of times, and THAT is more relevant than what you’d find in any given single analysis, which depends more on the randomness of the particular sample you happened to draw

set.seed(321)

beta.a <- mvrnorm(115,c(103,-.035),matrix(c(14,-.1,-.1,.005),nrow=2,byrow=T))
beta.b <- mvrnorm(102,c(103,.005),matrix(c(14,-.1,-.1,.005),nrow=2,byrow=T))
beta.c <- mvrnorm(118,c(103,.035),matrix(c(14,-.1,-.1,.003),nrow=2,byrow=T))

##############################
# prep for group A
##############################

cond <- c(rep("A",115))
gender <- sample(c("M","F"),length(cond),replace=TRUE)
age <- sample(65:75,length(cond),replace=TRUE)
friends <- floor(rgamma(length(cond),shape=1.5,scale=2) + (gender=='F')*runif(length(cond),0,2))
id <- paste("SUB",sample(10000:99999,length(cond),replace=FALSE),sep='')
beta.a[,2] <- beta.a[,2] + (gender=='M' & cond=='B')*.035 #creating gender*cond int
beta.a[,2] <- beta.a[,2] + (gender=='F' & cond=='C')*.015 #creating gender*cond int

datA <- cbind.data.frame(id,cond,gender,age,friends,beta.a)

names(datA)[6:7] <- c('intcpt1','slope1')
datA$intcpt <- datA$intcpt1 + (gender=='F')*1 - (age-mean(age))*.2 + friends*.45 #expected starting EF

datA$slope <- datA$slope1 - (gender=='M')*.0005 - (age-mean(age))*.004 #expected cognitive decline

datA <- datA[order(datA$slope),] #order by expected slope, lowest (most severe expected decline) first

## loop including dropping lowest 9 slopes
for (i in 1:nrow(datA)){

  score <- datA[i,8] + datA[i,9]*1:50 + 8*rnorm(50)
  ro <- runif(50) < .02
  score[ro] <- score[ro] + rnorm(1, 0, 15) #creating outliers
  score[runif(50) < .06] <- NA #creating random missingness
  
  if (i > 9){ #skip the first 9 subjects (those with the most severe expected decline), simulating their dropout
    write.table(score, file = paste('X', substr(datA[i,1], 4, 8), datA[i,'cond'], sep = '.'), 
                quote = FALSE, row.names = FALSE, col.names = FALSE)
    }
}


##############################
#prep for group B & C
##############################

betaBC <- rbind(beta.b, beta.c) 

cond <- c(rep("B",102),rep("C",118))
gender <- sample(c("M","F"),length(cond),replace=TRUE)
age <- sample(65:75,length(cond),replace=TRUE)
friends <- floor(rgamma(length(cond),shape=1.5,scale=2) + (gender=='F')*runif(length(cond),0,2))
id <- paste("SUB",sample(10000:99999,length(cond),replace=FALSE),sep='')
betaBC[,2] <- betaBC[,2] + (gender == 'M' & cond == 'B')*.035 #creating gender*cond int
betaBC[,2] <- betaBC[,2] + (gender == 'F' & cond == 'C')*.015 #creating gender*cond int

datBC <- cbind.data.frame(id, cond, gender, age, friends, betaBC)

names(datBC)[6:7] <- c('intcpt1','slope1')
datBC$intcpt <- datBC$intcpt1 + (gender == 'F')*1 - (age -mean(age))*.2 + friends*.45 #expected starting EF

datBC$slope <- datBC$slope1 - (gender == 'M')*.0005 - (age -mean(age))*.004 #expected cognitive decline

## GROUP B & C KEEP OLD APPROACH
for (i in 1:nrow(datBC)){
 score <- datBC[i,8] + datBC[i,9]*1:50 + 8*rnorm(50) 
 ro <- runif(50) < .02
 score[ro] <- score[ro] + rnorm(1,0,15) #creating outliers
 score[runif(50)<.06] <- NA #creating random missingness
 if (runif(1) > .09){ #this is simulating random dropout of ~9% of subjects
 write.table(score,file=paste('X',substr(datBC[i,1],4,8),datBC[i,'cond'],sep='.'),
             quote=FALSE,row.names=FALSE,col.names=FALSE)}
}

########################################

dat <- rbind.data.frame(datA, datBC)

summary(lm(intcpt ~ cond + gender + age + friends, data = dat)) #just checking what the true model is for starting EF, without noise
## 
## Call:
## lm(formula = intcpt ~ cond + gender + age + friends, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5437 -2.4143  0.0813  2.6285 10.2390 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 122.52090    4.56931  26.814  < 2e-16 ***
## condB        -0.08269    0.51189  -0.162    0.872    
## condC         0.38788    0.49674   0.781    0.435    
## genderM      -0.69205    0.42488  -1.629    0.104    
## age          -0.26559    0.06473  -4.103 5.15e-05 ***
## friends       0.38511    0.08568   4.495 9.65e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.743 on 329 degrees of freedom
## Multiple R-squared:  0.1169, Adjusted R-squared:  0.1034 
## F-statistic: 8.706 on 5 and 329 DF,  p-value: 9.095e-08
summary(lm(slope ~ cond + gender + age + friends + cond * gender,data = dat)) #just checking what the true model is for cognitive decline, without noise
## 
## Call:
## lm(formula = slope ~ cond + gender + age + friends + cond * gender, 
##     data = dat)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.194273 -0.049434  0.000146  0.038582  0.209285 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.1454760  0.0807172   1.802  0.07242 .  
## condB          0.0399540  0.0133328   2.997  0.00294 ** 
## condC          0.0951711  0.0127892   7.442 8.87e-13 ***
## genderM        0.0071858  0.0126432   0.568  0.57019    
## age           -0.0027446  0.0011471  -2.393  0.01729 *  
## friends        0.0009124  0.0015165   0.602  0.54782    
## condB:genderM  0.0191505  0.0182360   1.050  0.29443    
## condC:genderM -0.0256208  0.0176151  -1.454  0.14677    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06611 on 327 degrees of freedom
## Multiple R-squared:  0.2368, Adjusted R-squared:  0.2205 
## F-statistic:  14.5 on 7 and 327 DF,  p-value: < 2.2e-16
summary(dat)
##       id                cond              gender               age       
##  Length:335         Length:335         Length:335         Min.   :65.00  
##  Class :character   Class :character   Class :character   1st Qu.:67.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :70.00  
##                                                           Mean   :69.93  
##                                                           3rd Qu.:73.00  
##                                                           Max.   :75.00  
##     friends          intcpt1           slope1              intcpt      
##  Min.   : 0.000   Min.   : 92.46   Min.   :-0.222533   Min.   : 94.63  
##  1st Qu.: 1.000   1st Qu.:100.58   1st Qu.:-0.044668   1st Qu.:102.08  
##  Median : 3.000   Median :102.95   Median : 0.006245   Median :104.95  
##  Mean   : 3.012   Mean   :103.02   Mean   : 0.004659   Mean   :104.87  
##  3rd Qu.: 4.000   3rd Qu.:105.79   3rd Qu.: 0.055538   3rd Qu.:107.79  
##  Max.   :18.000   Max.   :113.47   Max.   : 0.232656   Max.   :114.31  
##      slope          
##  Min.   :-0.223589  
##  1st Qu.:-0.040942  
##  Median : 0.004378  
##  Mean   : 0.004404  
##  3rd Qu.: 0.054899  
##  Max.   : 0.228011
setwd("C:/Users/Dani Grant/Dropbox/graduate school records/fall 2021/R programming/FinalData")

write.table(dat[,c(1,3,4,5)], 'SubjectInfo_end.txt', quote = FALSE, row.names = FALSE, col.names = TRUE)