# if you haven't run this code before, you'll need to download the below packages first
# instructions on how to do this are included in the video
# but as a reminder, you use the packages tab to the right
library(psych) # for the describe() command
## Warning: package 'psych' was built under R version 4.3.2
library(naniar) # for the gg_miss_upset() command
## Warning: package 'naniar' was built under R version 4.3.2
library(expss) # for the cross_cases() command
## Warning: package 'expss' was built under R version 4.3.2
## Loading required package: maditr
## Warning: package 'maditr' was built under R version 4.3.2
##
## To drop variable use NULL: let(mtcars, am = NULL) %>% head()
##
## Use 'expss_output_rnotebook()' to display tables inside R Notebooks.
## To return to the console output, use 'expss_output_default()'.
##
## Attaching package: 'expss'
## The following object is masked from 'package:naniar':
##
## is_na
# for the lab, you'll import the CSV file you downloaded along with the current file we're working in (an RMD file)
# for the homework, you'll download the CSV file from your chosen README page (should be titled arc_data_final.csv or eammi2_data_final.csv)
# read the raw CSV into a dataframe; the first row of the file holds the column names
# (spell out TRUE rather than T, since T is an ordinary variable that can be overwritten)
df <- read.csv(file = "Data/fakedata.csv", header = TRUE)
# a few quick ways to inspect a dataframe from the console
# (clicking the object in the environment tab opens it in a new window instead)
colnames(df)
## [1] "id" "variable1" "variable2" "variable3" "variable4"
## [6] "variable5" "variable6" "variable7" "variable8" "variable9"
## [11] "variable10" "variable11" "variable12"
# print the first six rows
head(df, n = 6L)
## id variable1 variable2 variable3 variable4 variable5 variable6 variable7
## 1 id_1 level b level d level a level b 3.449743 3.4805072 1.039619
## 2 id_2 level b level c level a level c 2.707133 2.6169258 2.280291
## 3 id_3 level b level b level b level b 3.135185 3.2119597 1.854115
## 4 id_4 level c level d level b <NA> 2.901656 0.9048516 1.358520
## 5 id_5 level b level d level a <NA> 2.277099 2.9104901 1.343883
## 6 id_6 level b level d level b level c 2.404385 2.5616353 1.670581
## variable8 variable9 variable10 variable11 variable12
## 1 4.460219 1.153984 5.056802 1.546471 6.866127
## 2 4.754638 1.474888 4.214870 1.146760 7.208780
## 3 3.518784 1.218142 4.116025 2.746469 5.253641
## 4 4.617069 1.066567 4.670609 1.206538 5.847952
## 5 3.221986 1.374207 4.861695 1.575204 6.936754
## 6 4.892009 1.143127 4.459852 2.765479 6.697210
# compact summary of each column's type and first few values
utils::str(object = df)
## 'data.frame': 1000 obs. of 13 variables:
## $ id : chr "id_1" "id_2" "id_3" "id_4" ...
## $ variable1 : chr "level b" "level b" "level b" "level c" ...
## $ variable2 : chr "level d" "level c" "level b" "level d" ...
## $ variable3 : chr "level a" "level a" "level b" "level b" ...
## $ variable4 : chr "level b" "level c" "level b" NA ...
## $ variable5 : num 3.45 2.71 3.14 2.9 2.28 ...
## $ variable6 : num 3.481 2.617 3.212 0.905 2.91 ...
## $ variable7 : num 1.04 2.28 1.85 1.36 1.34 ...
## $ variable8 : num 4.46 4.75 3.52 4.62 3.22 ...
## $ variable9 : num 1.15 1.47 1.22 1.07 1.37 ...
## $ variable10: num 5.06 4.21 4.12 4.67 4.86 ...
## $ variable11: num 1.55 1.15 2.75 1.21 1.58 ...
## $ variable12: num 6.87 7.21 5.25 5.85 6.94 ...
# for the HW: use the codebook you created in the codebook activity to get the names of your variables (first column)
# enter this list of names in the select=c() argument to subset those columns from the dataframe
# variables for the lab: id, variable2, variable3, variable5, variable8, variable10, variable11
# keep only the columns used in the lab, selected by name
d <- df[, c(
  "id", "variable2", "variable3", "variable5",
  "variable8", "variable10", "variable11"
)]
# you can write a comment like this. this is where i will list my variables!!! very important!
# categorical variables need to be recoded as factors
# the content of the variable will stay the same, but R will treat the variable differently at times
# convert both categorical columns to factors in one step, then confirm the new types
d[c("variable2", "variable3")] <- lapply(d[c("variable2", "variable3")], as.factor)
str(d)
## 'data.frame': 1000 obs. of 7 variables:
## $ id : chr "id_1" "id_2" "id_3" "id_4" ...
## $ variable2 : Factor w/ 6 levels "level a","level b",..: 4 3 2 4 4 4 3 2 4 3 ...
## $ variable3 : Factor w/ 2 levels "level a","level b": 1 1 2 2 1 2 2 2 2 2 ...
## $ variable5 : num 3.45 2.71 3.14 2.9 2.28 ...
## $ variable8 : num 4.46 4.75 3.52 4.62 3.22 ...
## $ variable10: num 5.06 4.21 4.12 4.67 4.86 ...
## $ variable11: num 1.55 1.15 2.75 1.21 1.58 ...
We looked at the missing data in our dataset, and found that about 11% of the participants in our sample skipped at least one item (889 of 1,000 participants had complete data). We dropped these participants from our analysis, which is not advisable and runs the risk of dropping vulnerable groups or skewing results. However, we will proceed for the sake of this class using the reduced dataset.
# use the gg_miss_upset() command for a visualization of your missing data
# leave out the id column (the first column) so only the study variables are plotted
naniar::gg_miss_upset(d[names(d) != "id"], nsets = 6)
# use the na.omit() command to create a new dataframe in which any participants with missing data are dropped from the dataframe
d2 <- na.omit(d)
# proportion of participants retained after dropping NAs
# computed from the dataframes themselves so it stays correct if the data change
nrow(d2) / nrow(d)
## [1] 0.889
# last step is to export the data after you've dropped NAs
# for the HW, the file you're exporting here is what you'll use for all future HW assignments (labs will use the files I provide you)
# make sure you give it a name that is memorable!
# and make sure you save it to your Data folder!
# write the cleaned dataframe to the Data folder without the row-number column
# (spell out FALSE rather than F, since F is an ordinary variable that can be overwritten)
write.csv(d2, file = "Data/fakedata_final.csv", row.names = FALSE)
# since we've created a cleaned dataframe in d2, we'll use that for the rest of the lab/HW
# frequency count for each level of variable2
table(d2[["variable2"]])
##
## level a level b level c level d level e level f
## 29 233 351 228 45 3
# frequency count for each level of variable3
table(d2[["variable3"]])
##
## level a level b
## 200 689
# histograms of the four numeric variables in the cleaned dataframe, one plot per call
# use these to eyeball the shape of each distribution before checking skew and kurtosis
hist(d2$variable5)
hist(d2$variable8)
hist(d2$variable10)
hist(d2$variable11)
We analyzed the skew and kurtosis of our continuous variables and all were within the accepted range (-2/+2).
We analyzed the skew and kurtosis of our … and most were within the accepted range (-2/+2). However, some variables (list them in parentheses) were outside of the accepted range. For this analysis, we will use them anyway, but outside of this class this is bad practice.
# descriptive statistics (including skew and kurtosis) for every column of the cleaned dataframe
psych::describe(d2)
## vars n mean sd median trimmed mad min max range
## id* 1 889 445.00 256.78 445.00 445.00 329.14 1.00 889.00 888.00
## variable2* 2 889 3.04 0.94 3.00 3.02 1.48 1.00 6.00 5.00
## variable3* 3 889 1.78 0.42 2.00 1.84 0.00 1.00 2.00 1.00
## variable5 4 889 2.53 0.49 2.51 2.52 0.50 1.17 4.15 2.99
## variable8 5 889 3.89 0.65 3.96 3.93 0.68 1.36 5.00 3.63
## variable10 6 889 4.87 0.97 4.90 4.90 0.95 1.04 6.98 5.94
## variable11 7 889 1.82 0.61 1.72 1.76 0.64 1.00 3.99 2.99
## skew kurtosis se
## id* 0.00 -1.20 8.61
## variable2* 0.15 -0.30 0.03
## variable3* -1.32 -0.27 0.01
## variable5 0.14 -0.07 0.02
## variable8 -0.57 0.03 0.02
## variable10 -0.35 0.27 0.03
## variable11 0.83 0.15 0.02
# cross-tabulate the two categorical variables: variable2 in rows, variable3 in columns
expss::cross_cases(d2, variable2, variable3)
| | variable3 | |
|---|---|---|
| | level a | level b |
| variable2 | | |
| level a | 5 | 24 |
| level b | 56 | 177 |
| level c | 79 | 272 |
| level d | 46 | 182 |
| level e | 13 | 32 |
| level f | 1 | 2 |
| #Total cases | 200 | 689 |
# scatterplots for two pairs of numeric variables (x-axis variable listed first)
with(
  d2,
  plot(
    x = variable5,
    y = variable8,
    main = "Scatterplot of Variable5 and Variable8",
    xlab = "Variable5",
    ylab = "Variable8"
  )
)
with(
  d2,
  plot(
    x = variable10,
    y = variable11,
    main = "Scatterplot of Variable10 and Variable11",
    xlab = "Variable10",
    ylab = "Variable11"
  )
)
# boxplots of a numeric variable split by a categorical one (formula: numeric ~ categorical)
boxplot(
  variable5 ~ variable2,
  data = d2,
  main = "Boxplot of Variable2 and Variable5",
  xlab = "Variable2",
  ylab = "Variable5"
)
boxplot(
  variable11 ~ variable3,
  data = d2,
  main = "Boxplot of Variable3 and Variable11",
  xlab = "Variable3",
  ylab = "Variable11"
)