Edit the code chunks below and knit the document. Display your data
frames using glimpse() or print().
Load the following data from the reprores package (or access the
linked CSV files online). Each participant is identified by a unique
user_id.
# Attach readr for read_csv() and dplyr for glimpse() and the join/bind
# verbs used in every exercise below; without dplyr the visible code
# cannot run on its own.
library(readr)
library(dplyr)

# NOTE(review): these paths point at one user's Desktop, so the document
# only knits on that machine. The instructions above say the same data are
# available from the reprores package or the linked CSV files -- consider
# loading from one of those sources instead.
disgust_scores <- read_csv("~/Desktop/disgust_scores.csv")
personality_scores <- read_csv("~/Desktop/personality_scores.csv")
users <- read_csv("~/Desktop/users.csv")

# Show the column names, types and first few values of the disgust data.
glimpse(disgust_scores)
## Rows: 20,000
## Columns: 6
## $ id <dbl> 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1…
## $ user_id <dbl> 1, 155324, 155366, 155370, 155386, 155409, 155427, 155425, 15…
## $ date <date> 2008-07-10, 2008-07-11, 2008-07-12, 2008-07-12, 2008-07-12, …
## $ moral <dbl> 1.428571, 3.000000, 5.571429, 5.714286, 1.428571, 4.142857, 3…
## $ pathogen <dbl> 2.714286, 2.571429, 4.000000, 4.857143, 3.857143, 4.142857, 5…
## $ sexual <dbl> 1.7142857, 1.8571429, 0.4285714, 4.7142857, 3.7142857, 1.5714…
# Compact overview (columns, types, leading values) of the personality data.
personality_scores %>% glimpse()
## Rows: 15,000
## Columns: 7
## $ user_id <dbl> 0, 1, 2, 5, 8, 108, 233, 298, 426, 436, 685, 807, 871, 881, 94…
## $ date <date> 2006-03-23, 2006-02-08, 2005-10-24, 2005-12-07, 2006-07-27, 2…
## $ Ag <dbl> 1.833333, 2.571429, 2.714286, 1.714286, 3.285714, 4.285714, 4.…
## $ Co <dbl> 2.700000, 3.000000, 2.800000, 3.400000, 4.100000, 3.900000, 3.…
## $ Ex <dbl> 2.777778, 2.666667, 2.555556, 3.888889, 2.555556, 3.222222, 3.…
## $ Ne <dbl> 1.857143, 2.250000, 2.500000, 2.375000, 1.000000, 1.375000, 3.…
## $ Op <dbl> 2.857143, 4.285714, 3.857143, 4.142857, 5.571429, 3.714286, 4.…
# Compact overview (columns, types, leading values) of the user table.
users %>% glimpse()
## Rows: 52,043
## Columns: 3
## $ user_id <dbl> 0, 1, 2, 5, 8, 9, 10, 17, 19, 20, 21, 22, 23, 24, 27, 30, 31…
## $ sex <chr> NA, "female", "male", "male", "male", "male", "female", "fem…
## $ birthyear <dbl> NA, 1976, 1985, 1980, 1968, 1972, 1978, 1981, 1980, 1964, 19…
Add the users data to the complete
disgust_scores table.
# Keep every row of disgust_scores and attach the matching user's
# sex and birthyear (NA where the user_id is not found in users).
study1 <- disgust_scores %>%
  left_join(users, by = "user_id")
study1 %>% glimpse()
## Rows: 20,000
## Columns: 8
## $ id <dbl> 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, …
## $ user_id <dbl> 1, 155324, 155366, 155370, 155386, 155409, 155427, 155425, 1…
## $ date <date> 2008-07-10, 2008-07-11, 2008-07-12, 2008-07-12, 2008-07-12,…
## $ moral <dbl> 1.428571, 3.000000, 5.571429, 5.714286, 1.428571, 4.142857, …
## $ pathogen <dbl> 2.714286, 2.571429, 4.000000, 4.857143, 3.857143, 4.142857, …
## $ sexual <dbl> 1.7142857, 1.8571429, 0.4285714, 4.7142857, 3.7142857, 1.571…
## $ sex <chr> "female", "female", "male", "female", "male", "male", "femal…
## $ birthyear <dbl> 1976, 1984, 1982, 1968, 1983, 1983, 1987, 1978, 1986, 1970, …
Add the users data to the complete
disgust_scores data, but have the columns from the
users table appear first in the data frame.
# The task asks for every row of disgust_scores, with the users columns
# placed first. left_join(users, disgust_scores) instead keeps all 52,043
# users and pads those without disgust data with NA. right_join() keeps
# every row of the second table (disgust_scores) while still listing the
# columns of the first table (users) first.
# NOTE: the printed output below this chunk was produced by the earlier
# left_join() version; re-knit the document to refresh it.
study2 <- right_join(users, disgust_scores, by = "user_id")
glimpse(study2)
## Rows: 52,043
## Columns: 8
## $ user_id <dbl> 0, 1, 2, 5, 8, 9, 10, 17, 19, 20, 21, 22, 23, 24, 27, 30, 31…
## $ sex <chr> NA, "female", "male", "male", "male", "male", "female", "fem…
## $ birthyear <dbl> NA, 1976, 1985, 1980, 1968, 1972, 1978, 1981, 1980, 1964, 19…
## $ id <dbl> 1199, 1, 1599, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ date <date> 2008-10-07, 2008-07-10, 2008-10-27, NA, NA, NA, NA, NA, NA,…
## $ moral <dbl> 5.285714, 1.428571, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ pathogen <dbl> 4.714286, 2.714286, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ sexual <dbl> 2.142857, 1.714286, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
Create a table with only disgust_scores and
personality_scores data from the same user_id
collected on the same date.
# Matching on both user_id and date keeps only the rows where the same
# user has a disgust record and a personality record on the same date.
study3 <- disgust_scores %>%
  inner_join(personality_scores, by = c("user_id", "date"))
study3 %>% glimpse()
## Rows: 555
## Columns: 11
## $ id <dbl> 3, 6, 17, 18, 21, 22, 24, 25, 32, 33, 34, 37, 39, 43, 44, 46,…
## $ user_id <dbl> 155324, 155386, 155567, 155571, 155665, 155682, 155712, 15576…
## $ date <date> 2008-07-11, 2008-07-12, 2008-07-14, 2008-07-14, 2008-07-15, …
## $ moral <dbl> 3.000000, 1.428571, 5.571429, 2.714286, 4.142857, 2.714286, 4…
## $ pathogen <dbl> 2.571429, 3.857143, 4.714286, 6.000000, 4.142857, 3.000000, 4…
## $ sexual <dbl> 1.8571429, 3.7142857, 2.5714286, 4.4285714, 3.4285714, 0.7142…
## $ Ag <dbl> 4.000000, 3.142857, 5.285714, 3.714286, 2.857143, 3.428571, 3…
## $ Co <dbl> 3.300000, 2.600000, 5.700000, 3.800000, 1.800000, 3.000000, 4…
## $ Ex <dbl> 4.8888889, 4.0000000, 3.8888889, 4.5555556, 4.6666667, 3.5555…
## $ Ne <dbl> 2.375000, 0.250000, 1.125000, 2.250000, 3.125000, 1.375000, 3…
## $ Op <dbl> 4.714286, 5.142857, 3.142857, 2.857143, 4.571429, 4.857143, 5…
Join data from the same user_id, regardless of
date. Does this give you the same data table as above?
# Match on user_id only, ignoring when each questionnaire was completed.
# This is not the same table as the same-date join: it has 677 rows rather
# than 555, and because date is no longer a key it appears twice, as
# date.x (from disgust_scores) and date.y (from personality_scores).
study3_nodate <- disgust_scores %>%
  inner_join(personality_scores, by = "user_id")
study3_nodate %>% glimpse()
## Rows: 677
## Columns: 12
## $ id <dbl> 1, 3, 6, 17, 18, 20, 21, 22, 24, 25, 32, 33, 34, 35, 36, 37, …
## $ user_id <dbl> 1, 155324, 155386, 155567, 155571, 124756, 155665, 155682, 15…
## $ date.x <date> 2008-07-10, 2008-07-11, 2008-07-12, 2008-07-14, 2008-07-14, …
## $ moral <dbl> 1.428571, 3.000000, 1.428571, 5.571429, 2.714286, 5.428571, 4…
## $ pathogen <dbl> 2.714286, 2.571429, 3.857143, 4.714286, 6.000000, 5.142857, 4…
## $ sexual <dbl> 1.7142857, 1.8571429, 3.7142857, 2.5714286, 4.4285714, 2.7142…
## $ date.y <date> 2006-02-08, 2008-07-11, 2008-07-12, 2008-07-14, 2008-07-14, …
## $ Ag <dbl> 2.571429, 4.000000, 3.142857, 5.285714, 3.714286, 4.857143, 2…
## $ Co <dbl> 3.000000, 3.300000, 2.600000, 5.700000, 3.800000, 3.800000, 1…
## $ Ex <dbl> 2.6666667, 4.8888889, 4.0000000, 3.8888889, 4.5555556, 2.1111…
## $ Ne <dbl> 2.250, 2.375, 0.250, 1.125, 2.250, 3.375, 3.125, 1.375, 3.375…
## $ Op <dbl> 4.285714, 4.714286, 5.142857, 3.142857, 2.857143, 5.285714, 4…
Create a table of the disgust_scores and
personality_scores data containing all of the data
from both tables.
# Keep every row from both tables; a row with no partner for the same
# user_id and date gets NA in the other table's columns.
study4 <- disgust_scores %>%
  full_join(personality_scores, by = c("user_id", "date"))
study4 %>% glimpse()
## Rows: 34,445
## Columns: 11
## $ id <dbl> 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1…
## $ user_id <dbl> 1, 155324, 155366, 155370, 155386, 155409, 155427, 155425, 15…
## $ date <date> 2008-07-10, 2008-07-11, 2008-07-12, 2008-07-12, 2008-07-12, …
## $ moral <dbl> 1.428571, 3.000000, 5.571429, 5.714286, 1.428571, 4.142857, 3…
## $ pathogen <dbl> 2.714286, 2.571429, 4.000000, 4.857143, 3.857143, 4.142857, 5…
## $ sexual <dbl> 1.7142857, 1.8571429, 0.4285714, 4.7142857, 3.7142857, 1.5714…
## $ Ag <dbl> NA, 4.000000, NA, NA, 3.142857, NA, NA, NA, NA, NA, NA, NA, N…
## $ Co <dbl> NA, 3.3, NA, NA, 2.6, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Ex <dbl> NA, 4.888889, NA, NA, 4.000000, NA, NA, NA, NA, NA, NA, NA, N…
## $ Ne <dbl> NA, 2.375, NA, NA, 0.250, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Op <dbl> NA, 4.714286, NA, NA, 5.142857, NA, NA, NA, NA, NA, NA, NA, N…
Create a table of just the data from the disgust_scores
table for users who completed the personality_scores
questionnaire that same day.
# Filter disgust_scores down to rows that have a personality record for
# the same user_id and date; no personality columns are added.
study5 <- disgust_scores %>%
  semi_join(personality_scores, by = c("user_id", "date"))
study5 %>% glimpse()
## Rows: 555
## Columns: 6
## $ id <dbl> 3, 6, 17, 18, 21, 22, 24, 25, 32, 33, 34, 37, 39, 43, 44, 46,…
## $ user_id <dbl> 155324, 155386, 155567, 155571, 155665, 155682, 155712, 15576…
## $ date <date> 2008-07-11, 2008-07-12, 2008-07-14, 2008-07-14, 2008-07-15, …
## $ moral <dbl> 3.000000, 1.428571, 5.571429, 2.714286, 4.142857, 2.714286, 4…
## $ pathogen <dbl> 2.571429, 3.857143, 4.714286, 6.000000, 4.142857, 3.000000, 4…
## $ sexual <dbl> 1.8571429, 3.7142857, 2.5714286, 4.4285714, 3.4285714, 0.7142…
Create a table of data from users who completed
neither the personality_scores questionnaire nor the
disgust_scores questionnaire. (Hint: this will require
two steps.)
# First pass: drop every user who has at least one personality record.
no_personality <- users %>%
  anti_join(personality_scores, by = "user_id")
# Second pass: of the remaining users, drop those with a disgust record,
# leaving users who appear in neither questionnaire table.
study6 <- no_personality %>%
  anti_join(disgust_scores, by = "user_id")
study6 %>% glimpse()
## Rows: 17,728
## Columns: 3
## $ user_id <dbl> 9, 10, 17, 19, 20, 21, 22, 23, 24, 27, 30, 31, 32, 33, 34, 3…
## $ sex <chr> "male", "female", "female", "female", "male", "male", "male"…
## $ birthyear <dbl> 1972, 1978, 1981, 1980, 1964, 1945, 1973, 1985, 1982, 1965, …
Load new user data from users2.
Bind this table and the original users table into a single
table called users_all.
# Read the second batch of users from the course website.
users2 <- read_csv(
  file = "https://psyteachr.github.io/reprores-v3/data/users2.csv"
)
# Stack the two user tables row-wise; rows are kept as-is, so a user_id
# present in both tables appears twice.
users_all <- users %>% bind_rows(users2)
users_all %>% glimpse()
## Rows: 112,043
## Columns: 3
## $ user_id <dbl> 0, 1, 2, 5, 8, 9, 10, 17, 19, 20, 21, 22, 23, 24, 27, 30, 31…
## $ sex <chr> NA, "female", "male", "male", "male", "male", "female", "fem…
## $ birthyear <dbl> NA, 1976, 1985, 1980, 1968, 1972, 1978, 1981, 1980, 1964, 19…
How many users are in both the first and second user table? Use code
to get this number; don’t read the row number from the environment and
type it in. (Hint: What does nrow(mtcars)
return?)
# Users whose user_id occurs in both tables, counted via the row total
# of the matched table.
both_users <- users %>% inner_join(users2, by = "user_id")
both_n <- both_users %>% nrow()
print(both_n)
## [1] 11603
How many unique users are there in total across the first and second user tables?
# Stack both user tables, then count how many different user_id values
# occur in the combined table.
users_all <- users %>% bind_rows(users2)
unique_users <- n_distinct(users_all[["user_id"]])
print(unique_users)
## [1] 100440
How many users are in the first, but not the second, user table?
# Rows of users whose user_id has no match in users2.
first_only <- users %>% anti_join(users2, by = "user_id")
first_users <- first_only %>% nrow()
print(first_users)
## [1] 40440
How many users are in the second, but not the first, user table?
# Rows of users2 whose user_id has no match in users.
second_only <- users2 %>% anti_join(users, by = "user_id")
second_users <- second_only %>% nrow()
print(second_users)
## [1] 48397