# if you haven't run this code before, you'll need to download the below packages first
# instructions on how to do this are included in the video
# but as a reminder, you use the packages tab to the right
library(tidyverse) # for the map() command
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych) # for the describe() command
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(naniar) # for the gg_miss_upset() command
library(expss) # for the cross_cases() command
## Loading required package: maditr
##
## To drop variable use NULL: let(mtcars, am = NULL) %>% head()
##
##
## Attaching package: 'maditr'
##
## The following objects are masked from 'package:dplyr':
##
## between, coalesce, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
##
## The following object is masked from 'package:readr':
##
## cols
##
##
## Attaching package: 'expss'
##
## The following object is masked from 'package:naniar':
##
## is_na
##
## The following objects are masked from 'package:stringr':
##
## fixed, regex
##
## The following objects are masked from 'package:dplyr':
##
## compute, contains, na_if, recode, vars, where
##
## The following objects are masked from 'package:purrr':
##
## keep, modify, modify_if, when
##
## The following objects are masked from 'package:tidyr':
##
## contains, nest
##
## The following object is masked from 'package:ggplot2':
##
## vars
# for the lab, you'll import the CSV file you downloaded along with the current file we're working in (an RMD file)
# for the homework, you'll download the CSV file from your chosen README page (should be titled arc_data_final.csv or eammi2_data_final.csv)
# read the raw data into a dataframe called df
# header = TRUE tells R that the first row of the CSV holds the column names
# (always spell out TRUE/FALSE: T and F are ordinary variables that can be overwritten)
df <- read.csv(file = "Data/fakedata.csv", header = TRUE)
# these are commands useful for viewing a dataframe
# you can also click the object in the environment tab to view it in a new window
# list the column names of the dataframe
colnames(df)
## [1] "id" "variable1" "variable2" "variable3" "variable4"
## [6] "variable5" "variable6" "variable7" "variable8" "variable9"
## [11] "variable10" "variable11" "variable12"
# preview the first six rows of the dataframe
head(df, n = 6L)
## id variable1 variable2 variable3 variable4 variable5 variable6 variable7
## 1 id_1 level b level d level a level b 3.449743 3.4805072 1.039619
## 2 id_2 level b level c level a level c 2.707133 2.6169258 2.280291
## 3 id_3 level b level b level b level b 3.135185 3.2119597 1.854115
## 4 id_4 level c level d level b <NA> 2.901656 0.9048516 1.358520
## 5 id_5 level b level d level a <NA> 2.277099 2.9104901 1.343883
## 6 id_6 level b level d level b level c 2.404385 2.5616353 1.670581
## variable8 variable9 variable10 variable11 variable12
## 1 4.460219 1.153984 5.056802 1.546471 6.866127
## 2 4.754638 1.474888 4.214870 1.146760 7.208780
## 3 3.518784 1.218142 4.116025 2.746469 5.253641
## 4 4.617069 1.066567 4.670609 1.206538 5.847952
## 5 3.221986 1.374207 4.861695 1.575204 6.936754
## 6 4.892009 1.143127 4.459852 2.765479 6.697210
# show the type and first few values of every column
utils::str(df)
## 'data.frame': 1000 obs. of 13 variables:
## $ id : chr "id_1" "id_2" "id_3" "id_4" ...
## $ variable1 : chr "level b" "level b" "level b" "level c" ...
## $ variable2 : chr "level d" "level c" "level b" "level d" ...
## $ variable3 : chr "level a" "level a" "level b" "level b" ...
## $ variable4 : chr "level b" "level c" "level b" NA ...
## $ variable5 : num 3.45 2.71 3.14 2.9 2.28 ...
## $ variable6 : num 3.481 2.617 3.212 0.905 2.91 ...
## $ variable7 : num 1.04 2.28 1.85 1.36 1.34 ...
## $ variable8 : num 4.46 4.75 3.52 4.62 3.22 ...
## $ variable9 : num 1.15 1.47 1.22 1.07 1.37 ...
## $ variable10: num 5.06 4.21 4.12 4.67 4.86 ...
## $ variable11: num 1.55 1.15 2.75 1.21 1.58 ...
## $ variable12: num 6.87 7.21 5.25 5.85 6.94 ...
# for the HW: use the codebook you created in the codebook activity to get the names of your variables (first column)
# enter this list of names in the select=c() argument to subset those columns from the dataframe
# variables for the lab: id, variable2, variable3, variable5, variable8, variable10, variable11
# keep only the columns needed for the lab; all rows are retained
d <- subset(
  df,
  select = c(
    id,
    variable2, variable3,
    variable5, variable8, variable10, variable11
  )
)
# categorical variables need to be recoded as factors
# the content of the variable will stay the same, but R will treat the variable differently at times
# convert the three categorical columns (id, variable2, variable3) to factors in one step
# lapply() runs as.factor() on each selected column and the results are written back in place
d[c("id", "variable2", "variable3")] <- lapply(
  d[c("id", "variable2", "variable3")],
  as.factor
)
# confirm the conversion worked: the three columns should now be listed as Factor
str(d)
## 'data.frame': 1000 obs. of 7 variables:
## $ id : Factor w/ 1000 levels "id_1","id_10",..: 1 113 224 335 446 557 668 779 890 2 ...
## $ variable2 : Factor w/ 6 levels "level a","level b",..: 4 3 2 4 4 4 3 2 4 3 ...
## $ variable3 : Factor w/ 2 levels "level a","level b": 1 1 2 2 1 2 2 2 2 2 ...
## $ variable5 : num 3.45 2.71 3.14 2.9 2.28 ...
## $ variable8 : num 4.46 4.75 3.52 4.62 3.22 ...
## $ variable10: num 5.06 4.21 4.12 4.67 4.86 ...
## $ variable11: num 1.55 1.15 2.75 1.21 1.58 ...
# use the gg_miss_upset() command for a visualization of your missing data
# d[, -1] drops the first column (id) so only the six study variables are plotted
# nsets = 6 matches the number of variables that remain
gg_miss_upset(d[, -1], nsets = 6)
# use the na.omit() command to create a new dataframe in which any participants with missing data are dropped from the dataframe
# d2 keeps only the participants (rows) that have no missing values in any column of d
d2 <- na.omit(d)
# last step is to export the data after you've dropped NAs
# for the HW, the file you're exporting here is what you'll use for all future HW assignments (labs will use the files I provide you)
# make sure you give it a name that is memorable!
# and make sure you save it to your Data folder!
# row.names = FALSE keeps R's row numbers from being written as an extra column
# (always spell out TRUE/FALSE: T and F are ordinary variables that can be overwritten)
write.csv(d2, file = "Data/fakedata_clean_FINAL.csv", row.names = FALSE)
# since we've created a cleaned dataframe in d2, we'll use that for the rest of the lab/HW
# frequency count for each level of variable2
table(d2[["variable2"]])
##
## level a level b level c level d level e level f
## 29 233 351 228 45 3
# frequency count for each level of variable3
table(d2[["variable3"]])
##
## level a level b
## 200 689
# one histogram per continuous variable, to check the shape of each distribution visually
# NOTE: no main/xlab arguments are given, so hist() builds the title and x-axis label
# from the text of the argument (e.g. "d2$variable5"); keep the calls written this way
# unless you also want the plot labels to change
hist(d2$variable5)
hist(d2$variable8)
hist(d2$variable10)
hist(d2$variable11)
Cutoffs for skew and kurtosis are -2 to +2. Variable 5: skew and kurtosis are okay. Variable 8: skew and kurtosis are okay. Variable 10: skew and kurtosis are okay. Variable 11: skew and kurtosis are okay.
# descriptive statistics (including skew and kurtosis) for every column of d2
# the psych:: prefix makes it explicit which package's describe() is being called
psych::describe(d2)
## vars n mean sd median trimmed mad min max range
## id* 1 889 503.18 285.14 504.00 503.65 364.72 1.00 999.00 998.00
## variable2* 2 889 3.04 0.94 3.00 3.02 1.48 1.00 6.00 5.00
## variable3* 3 889 1.78 0.42 2.00 1.84 0.00 1.00 2.00 1.00
## variable5 4 889 2.53 0.49 2.51 2.52 0.50 1.17 4.15 2.99
## variable8 5 889 3.89 0.65 3.96 3.93 0.68 1.36 5.00 3.63
## variable10 6 889 4.87 0.97 4.90 4.90 0.95 1.04 6.98 5.94
## variable11 7 889 1.82 0.61 1.72 1.76 0.64 1.00 3.99 2.99
## skew kurtosis se
## id* -0.01 -1.18 9.56
## variable2* 0.15 -0.30 0.03
## variable3* -1.32 -0.27 0.01
## variable5 0.14 -0.07 0.02
## variable8 -0.57 0.03 0.02
## variable10 -0.35 0.27 0.03
## variable11 0.83 0.15 0.02
# cross-tabulation of counts: variable2 levels in the rows, variable3 levels in the columns
# the expss:: prefix makes it explicit which package's cross_cases() is being called
expss::cross_cases(d2, variable2, variable3)
| | variable3 | |
|---|---|---|
| | level a | level b |
| variable2 | | |
| level a | 5 | 24 |
| level b | 56 | 177 |
| level c | 79 | 272 |
| level d | 46 | 182 |
| level e | 13 | 32 |
| level f | 1 | 2 |
| #Total cases | 200 | 689 |
# scatterplot of two continuous variables: variable5 on the x-axis, variable8 on the y-axis
plot(
  x = d2$variable5,
  y = d2$variable8,
  main = "Scatterplot of variable5 and variable8",
  xlab = "variable5",
  ylab = "variable8"
)
# scatterplot of two continuous variables: variable10 on the x-axis, variable11 on the y-axis
plot(
  x = d2$variable10,
  y = d2$variable11,
  main = "Scatterplot of variable10 and variable11",
  xlab = "variable10",
  ylab = "variable11"
)
# boxplot of a continuous variable (variable5) split by a categorical variable (variable2)
# the formula reads as: outcome ~ grouping variable
boxplot(
  variable5 ~ variable2,
  data = d2,
  main = "Boxplot of variable5 and variable2",
  xlab = "x variable2",
  ylab = "Y variable5"
)
# boxplot of a continuous variable (variable8) split by a categorical variable (variable3)
# the formula reads as: outcome ~ grouping variable
boxplot(
  variable8 ~ variable3,
  data = d2,
  main = "Boxplot of variable8 and variable3",
  xlab = "x variable3",
  ylab = "Y variable8"
)