Data Prep

Load Libraries

# if you haven't run this code before, you'll need to download the below packages first
# instructions on how to do this are included in the video
# but as a reminder, you use the packages tab to the right

library(tidyverse) # for the map() command

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(psych) # for the describe() command

## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(naniar) # for the gg_miss_upset() command
library(expss) # for the cross_cases() command

## Loading required package: maditr
## 
## To drop variable use NULL: let(mtcars, am = NULL) %>% head()
## 
## 
## Attaching package: 'maditr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, coalesce, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## The following object is masked from 'package:readr':
## 
##     cols
## 
## 
## Attaching package: 'expss'
## 
## The following object is masked from 'package:naniar':
## 
##     is_na
## 
## The following objects are masked from 'package:stringr':
## 
##     fixed, regex
## 
## The following objects are masked from 'package:dplyr':
## 
##     compute, contains, na_if, recode, vars, where
## 
## The following objects are masked from 'package:purrr':
## 
##     keep, modify, modify_if, when
## 
## The following objects are masked from 'package:tidyr':
## 
##     contains, nest
## 
## The following object is masked from 'package:ggplot2':
## 
##     vars

Import Data

# for the lab, you'll import the CSV file you downloaded along with the current file we're working in (an RMD file)
# for the homework, you'll download the CSV file from your chosen README page (should be titled arc_data_final.csv or eammi2_data_final.csv)
# header = TRUE tells R that the first row of the CSV holds the column names
# (write TRUE in full: T is an ordinary variable that can be overwritten, TRUE cannot)
df <- read.csv(file = "Data/fakedata.csv", header = TRUE)

Viewing Data

# these are commands useful for viewing a dataframe
# you can also click the object in the environment tab to view it in a new window
# names() lists the column names of the dataframe
names(df)

##  [1] "id"         "variable1"  "variable2"  "variable3"  "variable4" 
##  [6] "variable5"  "variable6"  "variable7"  "variable8"  "variable9" 
## [11] "variable10" "variable11" "variable12"

# head() prints the first rows of the dataframe (six by default)
head(df)

##     id variable1 variable2 variable3 variable4 variable5 variable6 variable7
## 1 id_1   level b   level d   level a   level b  3.449743 3.4805072  1.039619
## 2 id_2   level b   level c   level a   level c  2.707133 2.6169258  2.280291
## 3 id_3   level b   level b   level b   level b  3.135185 3.2119597  1.854115
## 4 id_4   level c   level d   level b      <NA>  2.901656 0.9048516  1.358520
## 5 id_5   level b   level d   level a      <NA>  2.277099 2.9104901  1.343883
## 6 id_6   level b   level d   level b   level c  2.404385 2.5616353  1.670581
##   variable8 variable9 variable10 variable11 variable12
## 1  4.460219  1.153984   5.056802   1.546471   6.866127
## 2  4.754638  1.474888   4.214870   1.146760   7.208780
## 3  3.518784  1.218142   4.116025   2.746469   5.253641
## 4  4.617069  1.066567   4.670609   1.206538   5.847952
## 5  3.221986  1.374207   4.861695   1.575204   6.936754
## 6  4.892009  1.143127   4.459852   2.765479   6.697210

# str() shows each column's type along with its first few values
str(df)

## 'data.frame':    1000 obs. of  13 variables:
##  $ id        : chr  "id_1" "id_2" "id_3" "id_4" ...
##  $ variable1 : chr  "level b" "level b" "level b" "level c" ...
##  $ variable2 : chr  "level d" "level c" "level b" "level d" ...
##  $ variable3 : chr  "level a" "level a" "level b" "level b" ...
##  $ variable4 : chr  "level b" "level c" "level b" NA ...
##  $ variable5 : num  3.45 2.71 3.14 2.9 2.28 ...
##  $ variable6 : num  3.481 2.617 3.212 0.905 2.91 ...
##  $ variable7 : num  1.04 2.28 1.85 1.36 1.34 ...
##  $ variable8 : num  4.46 4.75 3.52 4.62 3.22 ...
##  $ variable9 : num  1.15 1.47 1.22 1.07 1.37 ...
##  $ variable10: num  5.06 4.21 4.12 4.67 4.86 ...
##  $ variable11: num  1.55 1.15 2.75 1.21 1.58 ...
##  $ variable12: num  6.87 7.21 5.25 5.85 6.94 ...

Subsetting Data

# for the HW: the first column of the codebook you built in the codebook activity lists your variable names
# put those names inside select = c() to keep only those columns of the dataframe
# variables for the lab: id, variable2, variable3, variable5, variable8, variable10, variable11
d <- subset(
  df,
  select = c(
    id,
    variable2,
    variable3,
    variable5,
    variable8,
    variable10,
    variable11
  )
)

Recoding Variables

# R needs categorical variables to be stored as factors
# the values themselves are unchanged; only the way R handles the column differs
d[["id"]] <- as.factor(d[["id"]])
d[["variable2"]] <- as.factor(d[["variable2"]])
d[["variable3"]] <- as.factor(d[["variable3"]])

# check that the three recoded columns now show up as Factor
str(d)

## 'data.frame':    1000 obs. of  7 variables:
##  $ id        : Factor w/ 1000 levels "id_1","id_10",..: 1 113 224 335 446 557 668 779 890 2 ...
##  $ variable2 : Factor w/ 6 levels "level a","level b",..: 4 3 2 4 4 4 3 2 4 3 ...
##  $ variable3 : Factor w/ 2 levels "level a","level b": 1 1 2 2 1 2 2 2 2 2 ...
##  $ variable5 : num  3.45 2.71 3.14 2.9 2.28 ...
##  $ variable8 : num  4.46 4.75 3.52 4.62 3.22 ...
##  $ variable10: num  5.06 4.21 4.12 4.67 4.86 ...
##  $ variable11: num  1.55 1.15 2.75 1.21 1.58 ...

Missing Data

# use the gg_miss_upset() command for a visualization of your missing data
# d[-1] leaves out the first column (id); nsets = 6 matches the six remaining variables
gg_miss_upset(d[-1], nsets = 6)

# use the na.omit() command to create a new dataframe in which any participants with missing data are dropped from the dataframe
d2 <- na.omit(d)

Exporting Data

# last step is to export the data after you've dropped NAs
# for the HW, the file you're exporting here is what you'll use for all future HW assignments (labs will use the files I provide you)
# make sure you give it a name that is memorable!
# and make sure you save it to your Data folder!
# row.names = FALSE keeps R's row numbers out of the file, so no extra column appears when it is read back in
# (write FALSE in full: F is an ordinary variable that can be overwritten, FALSE cannot)
write.csv(d2, file = "Data/fakedata_clean_FINAL.csv", row.names = FALSE)

# since we've created a cleaned dataframe in d2, we'll use that for the rest of the lab/HW

Basic Statistics

Univariate Plots: Histograms & Tables

# table() counts how many rows fall into each level of a categorical variable
table(d2$variable2)

## 
## level a level b level c level d level e level f 
##      29     233     351     228      45       3

# frequency count for each level of variable3
table(d2$variable3)

## 
## level a level b 
##     200     689

# hist() draws a histogram for each of the four continuous variables
hist(d2$variable5)

hist(d2$variable8)

hist(d2$variable10)

hist(d2$variable11)

Univariate Normality

Cutoffs for skew and kurtosis are -2 to +2. Variable 5: skew and kurtosis are okay. Variable 8: skew and kurtosis are okay. Variable 10: skew and kurtosis are okay. Variable 11: skew and kurtosis are okay.

# describe() (psych package) reports the skew and kurtosis values that are checked against the cutoffs
describe(d2)

##            vars   n   mean     sd median trimmed    mad  min    max  range
## id*           1 889 503.18 285.14 504.00  503.65 364.72 1.00 999.00 998.00
## variable2*    2 889   3.04   0.94   3.00    3.02   1.48 1.00   6.00   5.00
## variable3*    3 889   1.78   0.42   2.00    1.84   0.00 1.00   2.00   1.00
## variable5     4 889   2.53   0.49   2.51    2.52   0.50 1.17   4.15   2.99
## variable8     5 889   3.89   0.65   3.96    3.93   0.68 1.36   5.00   3.63
## variable10    6 889   4.87   0.97   4.90    4.90   0.95 1.04   6.98   5.94
## variable11    7 889   1.82   0.61   1.72    1.76   0.64 1.00   3.99   2.99
##             skew kurtosis   se
## id*        -0.01    -1.18 9.56
## variable2*  0.15    -0.30 0.03
## variable3* -1.32    -0.27 0.01
## variable5   0.14    -0.07 0.02
## variable8  -0.57     0.03 0.02
## variable10 -0.35     0.27 0.03
## variable11  0.83     0.15 0.02

Bivariate Plots

Crosstabs

# cross_cases() (expss package) counts the cases in each combination of the two categorical variables
cross_cases(d2, variable2, variable3)

	variable3
	level a	level b
variable2
level a	5	24
level b	56	177
level c	79	272
level d	46	182
level e	13	32
level f	1	2
#Total cases	200	689

Scatterplots

# scatterplot of two continuous variables: first variable on the x-axis, second on the y-axis
plot(
  x = d2[["variable5"]],
  y = d2[["variable8"]],
  xlab = "variable5",
  ylab = "variable8",
  main = "Scatterplot of variable5 and variable8"
)

plot(
  x = d2[["variable10"]],
  y = d2[["variable11"]],
  xlab = "variable10",
  ylab = "variable11",
  main = "Scatterplot of variable10 and variable11"
)

Boxplots

# boxplots use the formula continuous ~ categorical: one box per level of the categorical variable
boxplot(
  variable5 ~ variable2,
  data = d2,
  xlab = "x variable2",
  ylab = "Y variable5",
  main = "Boxplot of variable5 and variable2"
)

boxplot(
  variable8 ~ variable3,
  data = d2,
  xlab = "x variable3",
  ylab = "Y variable8",
  main = "Boxplot of variable8 and variable3"
)

P421 Lab - Data Prep & Basic Statistics

Runtian Huang

2023-07-02

Data Prep

Load Libraries

Import Data

Viewing Data

Subsetting Data

Recoding Variables

Missing Data

Exporting Data

Basic Statistics

Univariate Plots: Histograms & Tables

Univariate Normality

Bivariate Plots

Crosstabs

Scatterplots

Boxplots