1) Try using the help function for the following function: ddply. Hint: ddply lives in the plyr package, so you will need to load that package (installing it first if it's not installed) and then use ?ddply or help(ddply)
2) Import the reshape2 package and load the example dataset “smiths”
library(reshape2)
data(smiths)
3) Inspect the structure (str) and columns of the smiths dataset. Let's print the data frame too.
str(smiths)
## 'data.frame': 2 obs. of 5 variables:
## $ subject: Factor w/ 2 levels "John Smith","Mary Smith": 1 2
## $ time : int 1 1
## $ age : num 33 NA
## $ weight : num 90 NA
## $ height : num 1.87 1.54
colnames(smiths)
## [1] "subject" "time" "age" "weight" "height"
smiths
## subject time age weight height
## 1 John Smith 1 33 90 1.87
## 2 Mary Smith 1 NA NA 1.54
4) Use na.omit to remove the row with NA
na.omit(smiths)
## subject time age weight height
## 1 John Smith 1 33 90 1.87
6) Take the mean of height using mean()
mean(smiths$height)
## [1] 1.705
7) Try using summarize to do the same thing
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
summarise(smiths, mean_height = mean(height))
## mean_height
## 1 1.705
8) Using both time and subject as identifying variables, change the format to long
mlt_smith <- melt(smiths, id.vars = c("subject", "time"))
9) Load trustData.csv and qcTrust.csv and merge them. Try doing it automatically and by specifying “sub” as the ID variable
# Read both CSV files from the working directory.
trust_data <- read.csv(file = "trustData.csv")
qc <- read.csv(file = "qcTrust.csv")
# Preview the merge with the key column named explicitly.
head(inner_join(x = trust_data, y = qc, by = "sub"))
## sub condition value delay choice age gender
## 1 1 N 11 4 1 27 male
## 2 1 N 18 90 1 27 male
## 3 1 N 14 4 0 27 male
## 4 1 N 14 7 0 27 male
## 5 1 N 30 7 0 27 male
## 6 1 N 11 90 1 27 male
joined_data <- inner_join(trust_data, qc)
## Joining by: "sub"
10) Subset the joined data in order to take the mean choice of only female trials
female_data <- filter(joined_data, gender == "female")
mean(female_data$choice)
## [1] 0.7347
11) Use dcast to see how many trials for each delay you have by gender with margins
# Count trials in each gender x delay cell. No aggregation function is
# supplied, so dcast falls back to length (hence the message printed below);
# margins = TRUE adds the "(all)" row and column totals.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
dcast(joined_data, gender ~ delay, margins = TRUE)
## Aggregation function missing: defaulting to length
## gender 4 7 14 21 42 90 150 (all)
## 1 female 7 7 7 7 7 7 7 49
## 2 male 49 49 44 49 49 49 49 338
## 3 <NA> 14 14 13 14 14 14 14 97
## 4 (all) 70 70 64 70 70 70 70 484
13) Load the french fries data and inspect it using various methods
data(french_fries)
head(french_fries)
## time treatment subject rep potato buttery grassy rancid painty
## 61 1 1 3 1 2.9 0.0 0.0 0.0 5.5
## 25 1 1 3 2 14.0 0.0 0.0 1.1 0.0
## 62 1 1 10 1 11.0 6.4 0.0 0.0 0.0
## 26 1 1 10 2 9.9 5.9 2.9 2.2 0.0
## 63 1 1 15 1 1.2 0.1 0.0 1.1 5.1
## 27 1 1 15 2 8.8 3.0 3.6 1.5 2.3
14) Get into long format (Hint: try messing around with the id vars and value name and see what happens)
# Reshape to long format: one row per (time, treatment, subject, rep, type),
# with the measured columns stacked into "type" and their values in "rating".
m_fries <- melt(
  french_fries,
  id.vars = c("time", "treatment", "subject", "rep"),
  variable.name = "type",
  value.name = "rating"
)
Try the following exercises with and without chaining:
16) Use group_by and summarise to get the mean rating at each time point
# Mean rating at each time point, ignoring missing ratings.
# %>% replaces the %.% chain operator, which dplyr deprecated and later
# removed; TRUE is spelled out because T can be reassigned.
m_fries %>%
  group_by(time) %>%
  summarise(mean_rating = mean(rating, na.rm = TRUE))
## Source: local data frame [10 x 2]
##
## time mean_rating
## 1 10 3.877
## 2 9 3.216
## 3 8 3.049
## 4 7 2.910
## 5 6 3.103
## 6 5 3.030
## 7 4 3.046
## 8 3 3.135
## 9 2 3.251
## 10 1 3.149
17) Let's also get the SD and the number of trials in each. Hint: the length() function tells you the length of a vector, and na.omit() removes missing values from it
# Mean, standard deviation and row count of the ratings at each time point.
# The sd() argument is written out in full (na.rm): the original "na.r" only
# worked through partial argument matching. %>% replaces the removed %.%.
# Note: n() counts every row in the group, including rows whose rating is NA;
# use sum(!is.na(rating)) instead if only non-missing ratings should count.
m_fries %>%
  group_by(time) %>%
  summarise(
    mean_rating = mean(rating, na.rm = TRUE),
    SD = sd(rating, na.rm = TRUE),
    N = n()
  )
## Source: local data frame [10 x 4]
##
## time mean_rating SD N
## 1 10 3.877 4.061 300
## 2 9 3.216 3.898 300
## 3 8 3.049 3.787 360
## 4 7 2.910 3.532 360
## 5 6 3.103 3.655 360
## 6 5 3.030 3.626 360
## 7 4 3.046 3.625 360
## 8 3 3.135 3.636 360
## 9 2 3.251 3.733 360
## 10 1 3.149 3.756 360
18) Use transform to scale the ratings column by time period
# Standardise the ratings within each time point.
# Base transform() is not aware of dplyr groups, so using it after group_by()
# scales over the whole data set; mutate() evaluates scale() once per group.
# as.numeric() drops the one-column matrix (and attributes) that scale()
# returns, and ungroup() hands back an ungrouped table for later steps.
m_fries <- m_fries %>%
  group_by(time) %>%
  mutate(scale_rating = as.numeric(scale(rating))) %>%
  ungroup()
19) To check that scaling worked correctly, take the mean of the scale rating by time point again
# Sanity check: mean of the scaled ratings at each time point. Every value is
# essentially zero only when scale_rating was standardised within time.
# %>% replaces the removed %.% operator; the summary column is named so the
# output header does not depend on how the expression is typed.
m_fries %>%
  group_by(time) %>%
  summarise(mean_scale_rating = mean(scale_rating, na.rm = TRUE))
## Source: local data frame [10 x 2]
##
## time mean(scale_rating, na.rm = T)
## 1 10 0.191202
## 2 9 0.013890
## 3 8 -0.030902
## 4 7 -0.068261
## 5 6 -0.016481
## 6 5 -0.035966
## 7 4 -0.031607
## 8 3 -0.007763
## 9 2 0.023234
## 10 1 -0.004112
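If the ratings were scaled within each time point, every one of these means is essentially zero (up to floating-point error). Means that differ noticeably from zero, such as the 0.19 shown for time 10, indicate that the scaling was applied across the whole data set rather than within each group.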
20) One last problem: Filter subjects that had between 30-70 % later (or 1) choices.
Hint: use group_by and summarise to calculate whether each subject is “valid”. Then anti_join back to the original data
# Subjects whose proportion of later (1) choices lies strictly between
# 0.3 and 0.7. %>% replaces the %.% chain operator that dplyr removed.
valid_subs <- trust_data %>%
  group_by(sub) %>%
  summarise(mean_choice = mean(choice)) %>%
  filter(mean_choice > 0.3, mean_choice < 0.7)
# anti_join() returns the trust_data rows whose sub does NOT appear in
# valid_subs, i.e. the trials of subjects outside the 30-70 % range. The key
# is named explicitly rather than left to be guessed from shared columns.
# NOTE(review): if the intent is to keep only the valid subjects, this should
# be semi_join(trust_data, valid_subs, by = "sub") - confirm against the
# exercise's intent.
anti_join(trust_data, valid_subs, by = "sub")
## Joining by: "sub"
## sub condition value delay choice
## 1 4 N 18 90 0
## 2 4 N 22 14 1
## 3 4 N 14 4 1
## 4 4 N 14 7 1
## 5 4 N 11 7 1
## 6 4 N 30 7 1
## 7 4 N 34 7 1
## 8 4 N 11 90 0
## 9 4 N 18 4 1
## 10 4 N 26 21 1
## 11 4 N 26 7 1
## 12 4 N 18 14 1
## 13 4 N 34 150 0
## 14 4 N 22 7 1
## 15 4 N 14 21 1
## 16 4 N 30 42 1
## 17 4 N 22 150 1
## 18 4 N 18 42 0
## 19 4 N 30 90 1
## 20 4 N 22 42 1
## 21 4 N 14 42 1
## 22 4 N 18 7 1
## 23 4 N 34 42 1
## 24 4 N 11 150 1
## 25 4 N 11 14 1
## 26 4 N 26 150 1
## 27 4 N 22 90 1
## 28 4 N 30 14 0
## 29 4 N 14 90 1
## 30 4 N 14 150 1
## 31 4 N 34 90 1
## 32 4 N 26 14 1
## 33 4 N 11 21 0
## 34 4 N 18 150 1
## 35 4 N 14 14 1
## 36 4 N 22 21 1
## 37 4 N 26 4 1
## 38 4 N 30 21 1
## 39 4 N 18 21 1
## 40 4 U 11 4 0
## 41 4 U 11 42 0
## 42 4 U 30 4 0
## 43 4 U 26 90 0
## 44 4 U 30 150 0
## 45 4 U 26 42 0
## 46 4 U 34 21 0
## 47 4 U 22 4 1
## 48 4 U 34 4 1
## 49 2 N 11 4 1
## 50 2 N 18 90 1
## 51 2 N 22 14 1
## 52 2 N 14 4 1
## 53 2 N 11 7 1
## 54 2 N 30 7 1
## 55 2 N 34 7 1
## 56 2 N 26 21 1
## 57 2 N 26 7 1
## 58 2 N 18 14 1
## 59 2 N 30 4 1
## 60 2 N 30 150 1
## 61 2 N 34 150 1
## 62 2 N 22 7 1
## 63 2 N 30 42 1
## 64 2 N 22 150 0
## 65 2 N 30 90 1
## 66 2 N 22 42 1
## 67 2 N 14 42 0
## 68 2 N 18 7 1
## 69 2 N 34 42 1
## 70 2 N 11 150 0
## 71 2 N 26 150 0
## 72 2 N 22 90 1
## 73 2 N 30 14 1
## 74 2 N 14 90 0
## 75 2 N 34 90 1
## 76 2 N 26 14 1
## 77 2 N 18 150 0
## 78 2 N 14 14 1
## 79 2 N 22 21 1
## 80 2 N 26 4 1
## 81 2 N 30 21 1
## 82 2 N 18 21 1
## 83 2 N 26 42 1
## 84 2 N 34 21 1
## 85 2 N 22 4 1
## 86 2 N 34 4 1
## 87 2 U 14 7 1
## 88 2 U 11 90 0
## 89 2 U 18 4 1
## 90 2 U 11 42 0
## 91 2 U 26 90 1
## 92 2 U 14 21 0
## 93 2 U 18 42 0
## 94 2 U 11 14 0
## 95 2 U 14 150 0
## 96 2 U 11 21 1
## 97 1 N 11 4 1
## 98 1 N 18 90 1
## 99 1 N 14 4 0
## 100 1 N 14 7 0
## 101 1 N 30 7 0
## 102 1 N 11 90 1
## 103 1 N 18 4 1
## 104 1 N 26 21 1
## 105 1 N 26 7 0
## 106 1 N 18 14 1
## 107 1 N 30 4 1
## 108 1 N 26 90 1
## 109 1 N 30 150 0
## 110 1 N 22 7 1
## 111 1 N 14 21 1
## 112 1 N 30 42 1
## 113 1 N 22 150 1
## 114 1 N 22 42 1
## 115 1 N 14 42 1
## 116 1 N 18 7 0
## 117 1 N 34 42 1
## 118 1 N 11 14 1
## 119 1 N 22 90 0
## 120 1 N 14 90 1
## 121 1 N 14 150 1
## 122 1 N 34 90 1
## 123 1 N 26 14 1
## 124 1 N 11 21 1
## 125 1 N 18 150 1
## 126 1 N 14 14 1
## 127 1 N 22 21 1
## 128 1 N 26 4 1
## 129 1 N 30 21 1
## 130 1 N 18 21 1
## 131 1 N 26 42 1
## 132 1 N 34 21 1
## 133 1 N 34 4 1
## 134 1 U 22 14 1
## 135 1 U 11 7 0
## 136 1 U 34 7 1
## 137 1 U 11 42 1
## 138 1 U 34 150 1
## 139 1 U 18 42 1
## 140 1 U 30 90 1
## 141 1 U 11 150 1
## 142 1 U 26 150 1
## 143 1 U 30 14 1
## 144 1 U 22 4 1