Question 1
Using a loop, print the integers from 1 to 50. (Hint, use the print() function).
# Walk through the integers 1 to 50 and print each one on its own line.
for (i in seq_len(50)) {
  print(i)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
## [1] 37
## [1] 38
## [1] 39
## [1] 40
## [1] 41
## [1] 42
## [1] 43
## [1] 44
## [1] 45
## [1] 46
## [1] 47
## [1] 48
## [1] 49
## [1] 50
Question 2.
# Running total of the integers 0 through 1000, built up one term at a time.
j <- 0
for (i in seq(from = 0L, to = 1000L)) {
  j <- i + j
}
j
## [1] 500500
# Running total of the even integers 0, 2, 4, ..., 1000.
j <- 0
for (i in seq(from = 0, to = 1000, by = 2)) {
  j <- i + j
}
j
## [1] 250500
# Vectorised equivalent of the first loop: add up 0 through 1000 in one call.
sum(seq(from = 0L, to = 1000L))
## [1] 500500
# Vectorised equivalent of the second loop: add up the even numbers to 1000.
sum(seq(from = 0, to = 1000, by = 2))
## [1] 250500
Question 3
Here is a dataframe of survey data containing 5 questions I collected from 6 participants:
# Raw survey responses: one row per participant, one column per question.
# Several entries lie outside the valid 1-5 range and are cleaned up later.
survey <- data.frame(
  participant = c(1, 2, 3, 4, 5, 6),
  q1 = c(5, 3, 2, 7, 11, 0),
  q2 = c(4, 2, 2, 5, -10, 99),
  q3 = c(-4, -3, 4, 2, 9, 10),
  q4 = c(-30, 5, 2, 23, 4, 2),
  q5 = c(88, 4, -20, 2, 4, 2)
)
The response to each question should be an integer between 1 and 5. Obviously, we have some bad values in the dataframe. Let’s fix them.
Create a new object called survey.clean by assigning the original dataset to survey.clean.
Set the loop index to i.
Set the loop index.values to the vector of data columns.
In the loop code, assign the ith column of data to a new vector called data.temp.
Convert all invalid values in data.temp to NA (hint: use logical indexing, e.g. with %in%)
Assign data.temp back to the ith column of survey.clean.
Close the loop and let it run!
# Work on a copy so the raw `survey` data stays untouched.
survey.clean <- survey

# Column 1 holds the participant id; every later column holds responses.
# seq_len(...)[-1] is empty when there is only one column, whereas
# 2:ncol(...) would count downwards (2, 1) in that case.
for (i in seq_len(ncol(survey.clean))[-1]) {
  data.temp <- survey.clean[, i]
  # A valid response is exactly 1, 2, 3, 4 or 5; everything else becomes NA.
  # Negate with `!` instead of comparing against `F`, which is an ordinary
  # variable that can be reassigned (unlike FALSE).
  data.temp[!(data.temp %in% 1:5)] <- NA
  survey.clean[, i] <- data.temp
}
survey.clean
## participant q1 q2 q3 q4 q5
## 1 1 5 4 NA NA NA
## 2 2 3 2 NA 5 4
## 3 3 2 2 4 2 NA
## 4 4 NA 5 2 NA 2
## 5 5 NA NA NA 4 4
## 6 6 NA NA NA 2 2
Hint: Use the following steps
Add a new column called invalid.answers to the dataframe, initially containing all NA values.
Create a loop over the rows of the dataframe.
Assign the data for the ith row to a new vector called part.i
Calculate how many of the values in part.i are NA (use is.na())
Assign the result to the ith row in invalid.answers
# Start with an all-NA column that the loop fills in row by row.
survey.clean$invalid.answers <- NA

# Only the question columns can contain invalid answers. Counting NAs in
# these columns alone avoids having to subtract 1 for the still-empty
# invalid.answers cell of the current row.
question.cols <- setdiff(
  names(survey.clean),
  c("participant", "invalid.answers")
)

for (row.i in seq_len(nrow(survey.clean))) {
  data.temp <- survey.clean[row.i, question.cols]
  n.na <- sum(is.na(data.temp))
  survey.clean$invalid.answers[row.i] <- n.na
}
survey.clean
## participant q1 q2 q3 q4 q5 invalid.answers
## 1 1 5 4 NA NA NA 3
## 2 2 3 2 NA 5 4 1
## 3 3 2 2 4 2 NA 1
## 4 4 NA 5 2 NA 2 2
## 5 5 NA NA NA 4 4 3
## 6 6 NA NA NA 2 2 3
Question 4
Standardizing a variable means subtracting the mean, and then dividing by the standard deviation. Let’s use a loop to standardize the numeric columns in the pirates dataset. You can access this dataset in the yarrr package, or by downloading it from http://nathanieldphillips.com/wp-content/uploads/2016/01/pirates.txt
#' Standardize a numeric vector (z-scores)
#'
#' Subtracts the mean from every value and divides the result by the
#' standard deviation.
#'
#' @param x A numeric vector.
#' @param na.rm Should missing values be ignored when computing the mean and
#'   the standard deviation? Defaults to `FALSE`, in which case a single `NA`
#'   in `x` makes every element of the result `NA`.
#' @return A numeric vector the same length as `x`.
standardize.me <- function(x, na.rm = FALSE) {
  (x - mean(x, na.rm = na.rm)) / sd(x, na.rm = na.rm)
}
library(yarrr)
# The numeric columns were picked by hand. They are selected by name rather
# than by position so the choice does not depend on the column order of the
# pirates data frame.
num.columns <- c(
  "age", "tattoos", "tchests", "parrots", "sword.time", "beard.length"
)
pirates.z <- pirates[, num.columns]

# Replace each selected column with its standardized (z-score) version.
for (col.i in seq_len(ncol(pirates.z))) {
  pirates.z[, col.i] <- standardize.me(pirates.z[, col.i])
}
head(pirates.z)
## age tattoos tchests parrots sword.time beard.length
## 1 0.4705692 0.4837266 1.8316209 1.6147531 -0.16245384 -0.9591265
## 2 -0.4326347 1.6649880 -0.1670301 0.1411021 -0.25404809 0.6028098
## 3 -0.4326347 0.7790419 -0.3097909 -0.2273107 -0.09261323 1.0909148
## 4 0.2899284 0.7790419 -1.0235949 -0.5957234 -0.23687417 1.0909148
## 5 0.6512100 2.2556187 0.5467738 2.7199914 -0.18420748 -0.9591265
## 6 0.4705692 0.7790419 -0.7380733 -0.2273107 1.33396715 1.1885359
# Sanity check: report the mean and standard deviation of every column of
# pirates.z, each rounded to two decimals.
for (col.i in 1:ncol(pirates.z)) {
  column.mean <- round(mean(pirates.z[, col.i]), 2)
  column.sd <- round(sd(pirates.z[, col.i]), 2)
  print(paste0(
    "The mean of column ", col.i, " is ", column.mean,
    " and the standard deviation is ", column.sd
  ))
}
## [1] "The mean of column 1 is 0 and the standard deviation is 1"
## [1] "The mean of column 2 is 0 and the standard deviation is 1"
## [1] "The mean of column 3 is 0 and the standard deviation is 1"
## [1] "The mean of column 4 is 0 and the standard deviation is 1"
## [1] "The mean of column 5 is 0 and the standard deviation is 1"
## [1] "The mean of column 6 is 0 and the standard deviation is 1"
For this question we’ll use the auction dataset in the yarrr package. This dataset shows the selling prices of 1,000 pirate ships sold at an auction. If you can’t access the yarrr package, you can download the dataset using this link: “http://nathanieldphillips.com/wp-content/uploads/2016/01/auction.txt”
Question 5
# One row per distinct number of cannons; mean.price is a numeric
# placeholder that the loop below fills in.
result.df <- data.frame(
  "cannons" = unique(auction$cannons),
  "mean.price" = NA_real_,
  stringsAsFactors = FALSE
)

# seq_len() yields an empty sequence (and so no iterations) if result.df has
# no rows, whereas 1:nrow() would iterate over c(1, 0).
for (row.i in seq_len(nrow(result.df))) {
  cannons.i <- result.df$cannons[row.i]
  # Average selling price of all ships with this many cannons.
  mean.price.i <- mean(subset(auction, cannons == cannons.i)$price)
  result.df$mean.price[row.i] <- mean.price.i
}
result.df
## cannons mean.price
## 1 16 1254.7119
## 2 10 739.4052
## 3 12 832.2190
## 4 6 450.7007
## 5 14 1022.2750
## 6 4 226.6176
## 7 8 566.2791
## 8 2 273.0686
## 9 20 1423.1250
## 10 18 1426.8649
# Mean selling price for each number of cannons, computed in a single call
# instead of a loop.
aggregate(price ~ cannons, data = auction, FUN = mean)
## cannons price
## 1 2 273.0686
## 2 4 226.6176
## 3 6 450.7007
## 4 8 566.2791
## 5 10 739.4052
## 6 12 832.2190
## 7 14 1022.2750
## 8 16 1254.7119
## 9 18 1426.8649
## 10 20 1423.1250
Question 6
Using a loop, create 10 histograms showing the selling prices of ships with conditions of 1, 2, 3, …10. (Put them all in one plot by using par(mfrow())). In the main title of each plot, make sure to indicate which condition value is being plotted. Also, include a vertical line showing the mean selling prices of all ships in the plot.
# Lay the plots out on a 3 x 4 grid so that all ten histograms share a page.
par(mfrow = c(3, 4))

for (condition.i in 1:10) {
  # Selling prices of the ships with the current condition value.
  data.i <- subset(auction, condition == condition.i)$price

  # paste() already separates its arguments with a space, so the label must
  # not end in one (otherwise the title reads "condition =  1"). The x-axis
  # is labelled explicitly instead of showing the variable name "data.i".
  hist(data.i,
       main = paste("condition =", condition.i),
       xlab = "price")

  # Vertical line at the mean selling price of the ships in this panel.
  abline(v = mean(data.i),
         col = "blue",
         lwd = 3)
}
Question 7
Have you heard of the term “p-hacking”? Unfortunately it has nothing to do with pirates. It describes how some researchers will conduct as many tests as they can in order to get a test with a p-value less than .05 (which they can then say they predicted all along!). Here’s how easy it is to p-hack using a loop.
# Simulate 100 heights from a normal distribution, rounded to whole numbers.
height <- round(rnorm(n = 100, mean = 170, sd = 10), digits = 0)
hist(height)

# 100 x 100 matrix of standard-normal noise: each column is an unrelated
# "survey variable". Note that this replaces the earlier `survey` data frame.
survey <- matrix(
  rnorm(n = 100 * 100, mean = 0, sd = 1),
  nrow = 100,
  ncol = 100
)

# Correlate height with every noise column and keep each test's p-value.
p.values <- rep(NA, 100)
for (i in seq_along(p.values)) {
  test.result <- cor.test(height, survey[, i])
  p.values[i] <- test.result$p.value
}
hist(p.values)

# Number of tests that came out "significant" at the .05 level.
sum(p.values < .05)
## [1] 5
# Indices of the survey columns whose correlation test had p < .05.
sig.variables <- which(p.values < .05)

# One scatterplot per such column, arranged on a 3 x 3 grid.
par(mfrow = c(3, 3))
for (i in sig.variables) {
  data.temp <- data.frame(height = height, x = survey[, i])
  plot(survey[, i], height)
  # Overlay the least-squares regression line of height on this column.
  abline(lm(height ~ x, data = data.temp), col = "red")
}
## Oops, it's a correlation not a t-test. We'll create a new cortest.apa function...
#' Report a correlation test as an APA-style sentence
#'
#' Runs `cor.test(x, y)` and summarises the t statistic, its degrees of
#' freedom and the p-value in a single sentence.
#'
#' @param x,y Numeric vectors passed on to `cor.test()`.
#' @param null Value of the correlation under the null hypothesis. It is only
#'   used in the wording of the sentence; it is not passed to `cor.test()`.
#' @param p.critical Significance threshold. The test is reported as
#'   significant when the p-value is less than or equal to this value.
#' @return A single character string describing the test result.
cortest.apa <- function(x, y, null, p.critical) {
  test.result <- cor.test(x, y)

  test.statistic <- round(test.result$statistic, 2)
  deg.freedom <- round(test.result$parameter, 2)

  # Decide significance on the exact p-value and round only for display.
  # Comparing the rounded value would, for example, treat p = 0.104 as
  # "<= 0.1".
  p.exact <- test.result$p.value
  p.value <- round(p.exact, 2)

  if (p.exact <= p.critical) {
    verdict <- "significant"
    conclusion <- "We reject the null hypothesis that the true correlation is "
  } else {
    verdict <- "non-significant"
    conclusion <-
      "We fail to reject the null hypothesis that the true correlation is "
  }

  paste0(
    "A correlation test was ", verdict,
    " (t(", deg.freedom, ") = ", test.statistic, ", p = ", p.value, "). ",
    conclusion, null
  )
}
# Print the APA-style summary for every survey column flagged as significant.
for (sig.var.i in sig.variables) {
  apa.sentence <- cortest.apa(
    survey[, sig.var.i],
    height,
    null = 0,
    p.critical = .05
  )
  print(apa.sentence)
}
## [1] "A correlation test was significant (t(98) = -2.93, p = 0). We reject the null hypothesis that the true correlation is 0"
## [1] "A correlation test was significant (t(98) = -2.32, p = 0.02). We reject the null hypothesis that the true correlation is 0"
## [1] "A correlation test was significant (t(98) = 2.07, p = 0.04). We reject the null hypothesis that the true correlation is 0"
## [1] "A correlation test was significant (t(98) = -2.35, p = 0.02). We reject the null hypothesis that the true correlation is 0"
## [1] "A correlation test was significant (t(98) = -2.27, p = 0.03). We reject the null hypothesis that the true correlation is 0"