# if you haven't run this code before, you'll need to download the below packages first
# instructions on how to do this are included in the video
# but as a reminder, you use the packages tab to the right
library(psych) # for the describe() command
library(naniar) # for the gg_miss_upset() command
library(expss) # for the cross_cases() command
## 载入需要的程辑包:maditr
##
## To aggregate all non-grouping columns: take_all(mtcars, mean, by = am)
##
## 载入程辑包:'expss'
## The following object is masked from 'package:naniar':
##
## is_na
# for the lab, you'll import the CSV file you downloaded along with the current file we're working in (an RMD file)
# for the homework, you'll download the CSV file from your chosen README page (should be titled arc_data_final.csv or eammi2_data_final.csv)
# read the raw lab data; header = TRUE means the first row of the CSV holds the column names
# TRUE is spelled out because T is an ordinary variable that can be reassigned
df <- read.csv(file = "Data/fakedata.csv", header = TRUE)
# these are commands useful for viewing a dataframe
# you can also click the object in the environment tab to view it in a new window
# names() lists the column names of the dataframe
names(df)
## [1] "id" "variable1" "variable2" "variable3" "variable4"
## [6] "variable5" "variable6" "variable7" "variable8" "variable9"
## [11] "variable10" "variable11" "variable12"
# head() prints the first six rows so you can eyeball the values
head(df)
## id variable1 variable2 variable3 variable4 variable5 variable6 variable7
## 1 id_1 level b level d level a level b 3.449743 3.4805072 1.039619
## 2 id_2 level b level c level a level c 2.707133 2.6169258 2.280291
## 3 id_3 level b level b level b level b 3.135185 3.2119597 1.854115
## 4 id_4 level c level d level b <NA> 2.901656 0.9048516 1.358520
## 5 id_5 level b level d level a <NA> 2.277099 2.9104901 1.343883
## 6 id_6 level b level d level b level c 2.404385 2.5616353 1.670581
## variable8 variable9 variable10 variable11 variable12
## 1 4.460219 1.153984 5.056802 1.546471 6.866127
## 2 4.754638 1.474888 4.214870 1.146760 7.208780
## 3 3.518784 1.218142 4.116025 2.746469 5.253641
## 4 4.617069 1.066567 4.670609 1.206538 5.847952
## 5 3.221986 1.374207 4.861695 1.575204 6.936754
## 6 4.892009 1.143127 4.459852 2.765479 6.697210
# str() shows the type of each column along with its first few values
str(df)
## 'data.frame': 1000 obs. of 13 variables:
## $ id : chr "id_1" "id_2" "id_3" "id_4" ...
## $ variable1 : chr "level b" "level b" "level b" "level c" ...
## $ variable2 : chr "level d" "level c" "level b" "level d" ...
## $ variable3 : chr "level a" "level a" "level b" "level b" ...
## $ variable4 : chr "level b" "level c" "level b" NA ...
## $ variable5 : num 3.45 2.71 3.14 2.9 2.28 ...
## $ variable6 : num 3.481 2.617 3.212 0.905 2.91 ...
## $ variable7 : num 1.04 2.28 1.85 1.36 1.34 ...
## $ variable8 : num 4.46 4.75 3.52 4.62 3.22 ...
## $ variable9 : num 1.15 1.47 1.22 1.07 1.37 ...
## $ variable10: num 5.06 4.21 4.12 4.67 4.86 ...
## $ variable11: num 1.55 1.15 2.75 1.21 1.58 ...
## $ variable12: num 6.87 7.21 5.25 5.85 6.94 ...
# for the HW: use the codebook you created in the codebook activity to get the names of your variables (first column)
# enter this list of names in the select=c() argument to subset those columns from the dataframe
# variables for the lab: id, variable2, variable3, variable5, variable8, variable10, variable11
# keep only the lab variables, picking the columns by name (all rows are kept)
d <- df[c(
  "id", "variable2", "variable3",
  "variable5", "variable8", "variable10", "variable11"
)]
# you can write a comment like this. this is where I will list my variables!!! very important
# categorical variables need to be recoded as factors
# the content of the variable will stay the same, but R will treat the variable differently at times
# convert both categorical columns to factors in one assignment
d[c("variable2", "variable3")] <-
  lapply(d[c("variable2", "variable3")], as.factor)
# check the subset: seven columns, with variable2 and variable3 now stored as factors
str(d)
## 'data.frame': 1000 obs. of 7 variables:
## $ id : chr "id_1" "id_2" "id_3" "id_4" ...
## $ variable2 : Factor w/ 6 levels "level a","level b",..: 4 3 2 4 4 4 3 2 4 3 ...
## $ variable3 : Factor w/ 2 levels "level a","level b": 1 1 2 2 1 2 2 2 2 2 ...
## $ variable5 : num 3.45 2.71 3.14 2.9 2.28 ...
## $ variable8 : num 4.46 4.75 3.52 4.62 3.22 ...
## $ variable10: num 5.06 4.21 4.12 4.67 4.86 ...
## $ variable11: num 1.55 1.15 2.75 1.21 1.58 ...
We looked at the missing data in our dataset and found that about 11% of the participants in our sample skipped at least one item. We dropped these participants from our analysis, which is not advisable and runs the risk of dropping vulnerable groups.
# use the gg_miss_upset() command for a visualization of your missing data
# d[-1] leaves out the first column (id), so only the six study variables are passed in
# nsets = 6 equals the number of columns that remain after dropping id
gg_miss_upset(d[-1], nsets = 6)
# use the na.omit() command to create a new dataframe in which any participants with missing data are dropped from the dataframe
d2 <- na.omit(d)
# proportion of participants retained after dropping rows with missing data
# computed from the dataframes themselves so it stays correct if the data change
nrow(d2) / nrow(d)
## [1] 0.889
# last step is to export the data after you've dropped NAs
# for the HW, the file you're exporting here is what you'll use for all future HW assignments (labs will use the files I provide you)
# make sure you give it a name that is memorable!
# and make sure you save it to your Data folder!
# row.names = FALSE keeps R's row names out of the exported file
# FALSE is spelled out because F is an ordinary variable that can be reassigned
write.csv(d2, file = "Data/fakedata_final.csv", row.names = FALSE)
# since we've created a cleaned dataframe in d2, we'll use that for the rest of the lab/HW
# count how many participants fall in each level of variable2
table(d2$variable2)
##
## level a level b level c level d level e level f
## 29 233 351 228 45 3
# count how many participants fall in each level of variable3
table(d2$variable3)
##
## level a level b
## 200 689
# draw one histogram per continuous variable
# main and xlab are set explicitly so each plot keeps the "d2$<column>" labels
# that hist() would otherwise derive from the expression passed to it
invisible(lapply(
  c("variable5", "variable8", "variable10", "variable11"),
  function(column) {
    label <- paste0("d2$", column)
    hist(d2[[column]], main = paste("Histogram of", label), xlab = label)
  }
))
We analyzed the skew and kurtosis of our continuous variables and all were within the accepted range (-2/+2).
We analyzed the skew and kurtosis of our … and most were within the accepted range (-2/+2). However, some variables (list them in parentheses) were outside of the accepted range. For this analysis, we will use them anyway, but outside of this class this is bad practice.
# descriptive statistics for every column, including the skew and kurtosis used in the write-up
# the starred rows in the output (id, variable2, variable3) are the non-numeric columns
describe(d2)
## vars n mean sd median trimmed mad min max range
## id* 1 889 445.00 256.78 445.00 445.00 329.14 1.00 889.00 888.00
## variable2* 2 889 3.04 0.94 3.00 3.02 1.48 1.00 6.00 5.00
## variable3* 3 889 1.78 0.42 2.00 1.84 0.00 1.00 2.00 1.00
## variable5 4 889 2.53 0.49 2.51 2.52 0.50 1.17 4.15 2.99
## variable8 5 889 3.89 0.65 3.96 3.93 0.68 1.36 5.00 3.63
## variable10 6 889 4.87 0.97 4.90 4.90 0.95 1.04 6.98 5.94
## variable11 7 889 1.82 0.61 1.72 1.76 0.64 1.00 3.99 2.99
## skew kurtosis se
## id* 0.00 -1.20 8.61
## variable2* 0.15 -0.30 0.03
## variable3* -1.32 -0.27 0.01
## variable5 0.14 -0.07 0.02
## variable8 -0.57 0.03 0.02
## variable10 -0.35 0.27 0.03
## variable11 0.83 0.15 0.02
# cross-tabulate the two categorical variables: variable2 in the rows, variable3 in the columns
cross_cases(d2, variable2, variable3)
| | variable3 | |
|---|---|---|
| | level a | level b |
| variable2 | | |
| level a | 5 | 24 |
| level b | 56 | 177 |
| level c | 79 | 272 |
| level d | 46 | 182 |
| level e | 13 | 32 |
| level f | 1 | 2 |
| #Total cases | 200 | 689 |
# scatterplot of two continuous variables; with() lets the columns be named directly
with(d2, plot(
  x = variable5,
  y = variable8,
  main = "Scatterplot of Variable5 and Variable8",
  xlab = "Variable5",
  ylab = "Variable8"
))
# scatterplot of two continuous variables; with() lets the columns be named directly
with(d2, plot(
  x = variable10,
  y = variable11,
  main = "Scatterplot of Variable10 and Variable11",
  xlab = "Variable10",
  ylab = "Variable11"
))
# distribution of the continuous variable5 within each level of the categorical variable2
boxplot(
  variable5 ~ variable2,
  data = d2,
  main = "Boxplot of Variable2 and Variable5",
  xlab = "Variable2",
  ylab = "Variable5"
)
# distribution of the continuous variable11 within each level of the categorical variable3
boxplot(
  variable11 ~ variable3,
  data = d2,
  main = "Boxplot of Variable3 and Variable11",
  xlab = "Variable3",
  ylab = "Variable11"
)