Data Prep

Load Libraries

# if you haven't run this code before, you'll need to download the below packages first
# instructions on how to do this are included in the video
# but as a reminder, you use the packages tab to the right

library(psych) # for the describe() command
library(naniar) # for the gg_miss_upset() command
library(expss) # for the cross_cases() command

## 载入需要的程辑包：maditr

## 
## To aggregate all non-grouping columns: take_all(mtcars, mean, by = am)

## 
## 载入程辑包：'expss'

## The following object is masked from 'package:naniar':
## 
##     is_na

Import Data

# for the lab, you'll import the CSV file you downloaded along with the current file we're working in (an RMD file)
# for the homework, you'll download the CSV file from your chosen README page (should be titled arc_data_final.csv or eammi2_data_final.csv)
# header = TRUE tells read.csv() that the first row of the file holds the column names
# (TRUE is spelled out because the shorthand T is an ordinary variable that can be overwritten)
df <- read.csv(file = "Data/fakedata.csv", header = TRUE)

Viewing Data

# these are commands useful for viewing a dataframe
# you can also click the object in the environment tab to view it in a new window
# names() lists the column names of the dataframe
names(df)

##  [1] "id"         "variable1"  "variable2"  "variable3"  "variable4" 
##  [6] "variable5"  "variable6"  "variable7"  "variable8"  "variable9" 
## [11] "variable10" "variable11" "variable12"

# head() prints the first rows of the dataframe (six rows are shown below)
head(df)

##     id variable1 variable2 variable3 variable4 variable5 variable6 variable7
## 1 id_1   level b   level d   level a   level b  3.449743 3.4805072  1.039619
## 2 id_2   level b   level c   level a   level c  2.707133 2.6169258  2.280291
## 3 id_3   level b   level b   level b   level b  3.135185 3.2119597  1.854115
## 4 id_4   level c   level d   level b      <NA>  2.901656 0.9048516  1.358520
## 5 id_5   level b   level d   level a      <NA>  2.277099 2.9104901  1.343883
## 6 id_6   level b   level d   level b   level c  2.404385 2.5616353  1.670581
##   variable8 variable9 variable10 variable11 variable12
## 1  4.460219  1.153984   5.056802   1.546471   6.866127
## 2  4.754638  1.474888   4.214870   1.146760   7.208780
## 3  3.518784  1.218142   4.116025   2.746469   5.253641
## 4  4.617069  1.066567   4.670609   1.206538   5.847952
## 5  3.221986  1.374207   4.861695   1.575204   6.936754
## 6  4.892009  1.143127   4.459852   2.765479   6.697210

# str() summarizes the structure: the number of rows and columns, plus each column's type
str(df)

## 'data.frame':    1000 obs. of  13 variables:
##  $ id        : chr  "id_1" "id_2" "id_3" "id_4" ...
##  $ variable1 : chr  "level b" "level b" "level b" "level c" ...
##  $ variable2 : chr  "level d" "level c" "level b" "level d" ...
##  $ variable3 : chr  "level a" "level a" "level b" "level b" ...
##  $ variable4 : chr  "level b" "level c" "level b" NA ...
##  $ variable5 : num  3.45 2.71 3.14 2.9 2.28 ...
##  $ variable6 : num  3.481 2.617 3.212 0.905 2.91 ...
##  $ variable7 : num  1.04 2.28 1.85 1.36 1.34 ...
##  $ variable8 : num  4.46 4.75 3.52 4.62 3.22 ...
##  $ variable9 : num  1.15 1.47 1.22 1.07 1.37 ...
##  $ variable10: num  5.06 4.21 4.12 4.67 4.86 ...
##  $ variable11: num  1.55 1.15 2.75 1.21 1.58 ...
##  $ variable12: num  6.87 7.21 5.25 5.85 6.94 ...

Subsetting Data

# for the HW: take your variable names from the first column of the codebook
# you built in the codebook activity and list them inside select = c()
# lab variables: id, variable2, variable3, variable5, variable8, variable10, variable11
d <- subset(
  df,
  select = c(
    id,
    variable2, variable3,
    variable5, variable8, variable10, variable11
  )
)
# you can write a comment like this -- this is where I will list my variables (very important!)

Recoding Variables

# categorical columns are converted to factors in one pass
# the values themselves stay the same, but R handles a factor differently
# from plain text in some commands
d[c("variable2", "variable3")] <- lapply(
  d[c("variable2", "variable3")],
  as.factor
)

# check that the two columns now show up as factors
str(d)

## 'data.frame':    1000 obs. of  7 variables:
##  $ id        : chr  "id_1" "id_2" "id_3" "id_4" ...
##  $ variable2 : Factor w/ 6 levels "level a","level b",..: 4 3 2 4 4 4 3 2 4 3 ...
##  $ variable3 : Factor w/ 2 levels "level a","level b": 1 1 2 2 1 2 2 2 2 2 ...
##  $ variable5 : num  3.45 2.71 3.14 2.9 2.28 ...
##  $ variable8 : num  4.46 4.75 3.52 4.62 3.22 ...
##  $ variable10: num  5.06 4.21 4.12 4.67 4.86 ...
##  $ variable11: num  1.55 1.15 2.75 1.21 1.58 ...

Missing Data

We looked at the missing data in our dataset and found that about 11% of the participants in our sample skipped at least one item. We dropped these participants from our analysis, which is not advisable and runs the risk of dropping vulnerable groups.

# use the gg_miss_upset() command for a visualization of your missing data
# d[-1] leaves out the first column (id), so only the six study variables are plotted
gg_miss_upset(d[-1], nsets = 6)

# use the na.omit() command to create a new dataframe in which any participants with missing data are dropped from the dataframe
d2 <- na.omit(d)

# proportion of participants kept after dropping missing data
# computed from the dataframes themselves rather than typed in by hand,
# so the number stays correct if the data or the selected variables change
nrow(d2) / nrow(d)

## [1] 0.889

Exporting Data

# last step is to export the data after you've dropped NAs
# for the HW, the file you're exporting here is what you'll use for all future HW assignments (labs will use the files I provide you)
# make sure you give it a name that is memorable!
# and make sure you save it to your Data folder!
# row.names = FALSE keeps R's row numbers out of the exported file
# (FALSE is spelled out because the shorthand F is an ordinary variable that can be overwritten)
write.csv(d2, file = "Data/fakedata_final.csv", row.names = FALSE)

# since we've created a cleaned dataframe in d2, we'll use that for the rest of the lab/HW

Basic Statistics

Univariate Plots: Histograms & Tables

# frequency table: number of participants in each level of variable2
table(d2$variable2)

## 
## level a level b level c level d level e level f 
##      29     233     351     228      45       3

# frequency table: number of participants in each level of variable3
table(d2$variable3)

## 
## level a level b 
##     200     689

# one histogram per numeric variable, to look at the shape of each distribution
hist(d2$variable5)

hist(d2$variable8)

hist(d2$variable10)

hist(d2$variable11)

Univariate Normality

We analyzed the skew and kurtosis of our continuous variables, and all were within the accepted range (-2/+2).

We analyzed the skew and kurtosis of our … and most were within the accepted range (-2/+2). However, some variables (list them in parentheses) were outside of the accepted range. For this analysis, we will use them anyway, but outside of this class this is bad practice.

# describe() prints summary statistics for every column, including skew and kurtosis
# NOTE(review): the rows marked with * in the output (id, variable2, variable3) are the
# non-numeric columns, so their skew/kurtosis should presumably not be interpreted -- confirm
describe(d2)

##            vars   n   mean     sd median trimmed    mad  min    max  range
## id*           1 889 445.00 256.78 445.00  445.00 329.14 1.00 889.00 888.00
## variable2*    2 889   3.04   0.94   3.00    3.02   1.48 1.00   6.00   5.00
## variable3*    3 889   1.78   0.42   2.00    1.84   0.00 1.00   2.00   1.00
## variable5     4 889   2.53   0.49   2.51    2.52   0.50 1.17   4.15   2.99
## variable8     5 889   3.89   0.65   3.96    3.93   0.68 1.36   5.00   3.63
## variable10    6 889   4.87   0.97   4.90    4.90   0.95 1.04   6.98   5.94
## variable11    7 889   1.82   0.61   1.72    1.76   0.64 1.00   3.99   2.99
##             skew kurtosis   se
## id*         0.00    -1.20 8.61
## variable2*  0.15    -0.30 0.03
## variable3* -1.32    -0.27 0.01
## variable5   0.14    -0.07 0.02
## variable8  -0.57     0.03 0.02
## variable10 -0.35     0.27 0.03
## variable11  0.83     0.15 0.02

Bivariate Plots

Crosstabs

# crosstab of counts: variable2 levels as rows, variable3 levels as columns
cross_cases(d2, variable2, variable3)

	variable3
	level a	level b
variable2
level a	5	24
level b	56	177
level c	79	272
level d	46	182
level e	13	32
level f	1	2
#Total cases	200	689

Scatterplots

# scatterplot: variable5 on the x-axis against variable8 on the y-axis
with(
  d2,
  plot(
    variable5, variable8,
    main = "Scatterplot of Variable5 and Variable8",
    xlab = "Variable5",
    ylab = "Variable8"
  )
)

# scatterplot: variable10 on the x-axis against variable11 on the y-axis
with(
  d2,
  plot(
    variable10, variable11,
    main = "Scatterplot of Variable10 and Variable11",
    xlab = "Variable10",
    ylab = "Variable11"
  )
)

Boxplots

# boxplot of variable5 (numeric) split by the levels of variable2 (factor)
boxplot(
  variable5 ~ variable2,
  data = d2,
  main = "Boxplot of Variable2 and Variable5",
  xlab = "Variable2",
  ylab = "Variable5"
)

# boxplot of variable11 (numeric) split by the levels of variable3 (factor)
boxplot(
  variable11 ~ variable3,
  data = d2,
  main = "Boxplot of Variable3 and Variable11",
  xlab = "Variable3",
  ylab = "Variable11"
)

P421 Lab - Data Prep & Basic Statistics

Hao Chen

2024-02-22