1 Data Cleaning

1.1 Load Libraries

library(tidyverse) # for the map() command and the %>% pipe
library(psych) # for the describe() command
library(naniar) # for the gg_miss_upset() command
library(expss) # for the cross_cases() command
library(ggplot2) # NOTE(review): no direct ggplot2 call is visible in this script and tidyverse already attaches it -- confirm whether this line is needed

1.2 Import Data

# Read the EAMMi2 data file into `df`; the first row of the file holds the
# column names. TRUE is spelled out because `T` is an ordinary variable that
# can be reassigned.
# NOTE(review): this absolute path only resolves on the original author's
# machine -- point it at your own copy of the CSV before running.
df <- read.csv(
  file = "/Users/lydiaschwartz/Desktop/r studio/Data Cleaning and Basic Statistics HW/EAMMi2-Data1.2.csv",
  header = TRUE
)

1.3 Viewing Data

# Column names of the raw data
colnames(df)

# First six rows
head(df, n = 6L)

# Type and a short preview of every column
str(object = df)

1.4 Subsetting Data

# Keep only the columns used in this analysis: belnow, the two marriage
# items, income, race, the two political items, and the ten efficacy items.
keep_cols <- c(
  "belnow", "marriage2", "marriage5", "income", "race", "politics", "party",
  paste0("efficacy_", 1:10)
)
d <- df[, keep_cols, drop = FALSE]

              

1.5 Basic Data Checking

1.5.1 Checking Values

# Tabulate every column of `d`, always showing an NA count, so unexpected
# codes and missing values are easy to spot.
map(d, table, useNA = "always")

1.5.2 Recoding data


# Frequencies of the numeric race codes, including missing values
table(d$race, useNA = "always")

# Recode the numeric race codes 1-6 into text labels; the position of each
# label in the vector is the code it replaces. Anything else stays NA.
race_labels <- c("white", "black", "hispanic", "asian", "nativeamer", "other")
d$race_rc <- NA
for (race_code in seq_along(race_labels)) {
  d$race_rc[d$race == race_code] <- race_labels[race_code]
}

1.5.3 Factor Scores/Composite Variables

# Inspect the column types first: the composites below use arithmetic, so
# the items they combine must be numeric.
str(d)


# Efficacy composite: sum of the ten efficacy items divided by 10
# (NA whenever any one of the ten items is missing).
efficacy_items <- paste0("efficacy_", 1:10)
d$efficacy <- Reduce(`+`, d[efficacy_items]) / 10

# Political views composite: average of the politics and party items
d$politicalviews <- (d[["politics"]] + d[["party"]]) / 2

# Single-item measures copied under more descriptive names
d$marriageimportance <- d[["marriage2"]]

d$belong <- d[["belnow"]]

1.6 Exporting Data

# Keep only the analysis variables and save them as the cleaned data set.
d2 <- subset(
  d,
  select = c(efficacy, belong, marriageimportance, race_rc, politicalviews, income)
)

# row.names = FALSE stops R from writing the row numbers as an extra column.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
# NOTE(review): absolute path is specific to the original author's machine.
write.csv(
  d2,
  file = "/Users/lydiaschwartz/Desktop/r studio/Data Cleaning and Basic Statistics HW/final.csv",
  row.names = FALSE
)

2 Basic Statistics

2.1 Import Cleaned Data

# Read the cleaned data written in section 1.6 back into `d2`. TRUE is
# spelled out because `T` is an ordinary variable that can be reassigned.
# NOTE(review): absolute path is specific to the original author's machine.
d2 <- read.csv(
  file = "/Users/lydiaschwartz/Desktop/r studio/Data Cleaning and Basic Statistics HW/final.csv",
  header = TRUE
)

2.2 Check Data

2.2.1 Formatting

# Preview the cleaned data and check the column types
head(d2)
str(d2)

# Store the categorical variables as factors. The race column in the cleaned
# data is named `race_rc`; writing `d2$race` only works through partial name
# matching and would create a separate `race` column instead of converting
# `race_rc` itself.
d2$race_rc <- as.factor(d2$race_rc)
d2$income <- as.factor(d2$income)

2.3 Making a new subset with updated race (from numbers to written form)

# Restrict `d2` to the six analysis variables, in this order.
analysis_cols <- c(
  "efficacy", "belong", "marriageimportance",
  "race_rc", "politicalviews", "income"
)
d2 <- d2[, analysis_cols, drop = FALSE]

2.4 Univariate Normality

# Descriptive statistics for every column of `d2`; the skew and kurtosis
# columns are the ones used for the univariate normality check.
psych::describe(x = d2)

2.5 Histograms

# use the hist() command to create a histogram for your continuous variables
# (drawn from the cleaned data `d2`, the same data frame that describe() and
# the tables in this section use, rather than the pre-export data frame `d`)
hist(d2$efficacy)
hist(d2$belong)
hist(d2$marriageimportance)
hist(d2$politicalviews)



# Frequency tables for the categorical variables; useNA = "always" adds an
# NA row even when nothing is missing.
table(d2[["race_rc"]], useNA = "always")
table(d2[["income"]], useNA = "always")

2.6 Missing Data

# use the gg_miss_upset() command to visualize your missing data
# `nsets` is a count, so it is passed as a number rather than the string "6";
# the plot is drawn from the six analysis variables in `d2`.
gg_miss_upset(data = d2, nsets = 6)

# create a new dataframe with only your complete cases/observations
# na.omit() is applied to `d2` so that rows are dropped only for missing
# values on the analysis variables; applying it to `d` would also drop rows
# that are missing on columns not analysed here (e.g. marriage5) and would
# replace `d2` with the wider, pre-export data frame.
d2 <- na.omit(d2)

2.7 Crosstabs & Scatterplots

2.7.1 Crosstabs

# Crosstab of the two categorical variables: race categories as rows,
# income categories as columns.
cross_cases(data = d2, cell_vars = race_rc, col_vars = income)

2.7.2 Scatterplots

# Scatterplot of two continuous variables: efficacy (x) against belonging (y)
with(
  d2,
  plot(
    x = efficacy,
    y = belong,
    main = "scatterplot of efficacy and belonging",
    xlab = "efficacy",
    ylab = "belonging"
  )
)

# Scatterplot of two continuous variables: marriage importance (x) against
# political views (y)
with(
  d2,
  plot(
    x = marriageimportance,
    y = politicalviews,
    main = "scatterplot of marriage importance and political views",
    xlab = "marriage importance",
    ylab = "political views"
  )
)

# use the plot() command to create scatterplots of your continuous variables
# The first argument is drawn on the x-axis and the second on the y-axis, so
# efficacy labels the x-axis and marriage importance labels the y-axis.
plot(d2$efficacy, d2$marriageimportance,
     main="scatterplot of efficacy marriage importance",
     xlab = "efficacy",
     ylab = "marriage importance")


# use the plot() command to create scatterplots of your continuous variables
# The belonging column is named `belong`; `d2$belonging` does not exist and
# evaluates to NULL, which makes plot() draw political views against the row
# index instead of against belonging.
plot(d2$politicalviews, d2$belong,
     main="scatterplot of political views and belonging",
     xlab = "political views",
     ylab = "belonging")

2.8 Boxplots

# Boxplots of one variable (left of ~) split by race category (right of ~)

# income by race
boxplot(
  income ~ race_rc,
  data = d2,
  main = "income and race",
  xlab = "race",
  ylab = "income"
)

# marriage importance by race
boxplot(
  marriageimportance ~ race_rc,
  data = d2,
  main = "race and marriage",
  xlab = "race",
  ylab = "marriage importance"
)

# efficacy by race
boxplot(
  efficacy ~ race_rc,
  data = d2,
  main = "race and efficacy",
  xlab = "race",
  ylab = "efficacy"
)


# Number of missing values in each analysis variable, kept together as one
# named vector (e.g. num_na["income"]) instead of overwriting a single value
# six times. All counts come from `d`: the recoded race column `race_rc`
# lives in `d`, not in the raw data frame `df`, where it does not exist and
# the count would always be 0.
na_count_vars <- c(
  "income", "race_rc", "belong",
  "politicalviews", "marriageimportance", "efficacy"
)
num_na <- colSums(is.na(d[na_count_vars]))

# Rows remaining after the cases with missing data were removed
sample_size_complete <- nrow(d2)
# Rows in the full data set before removal (this is the value `sample_size`
# ended up holding in the earlier version of this code as well)
sample_size <- nrow(d)


# Percentage of cases dropped because of missing data, computed from the
# data frames rather than typed in by hand so that it stays correct if the
# data change.
item_non_response_percentage <-
  (sample_size - sample_size_complete) / sample_size * 100

3 Homework Write up

Do your continuous variables meet the criteria for univariate normality? Skew and kurtosis should be between -2 and +2.

Yes, my continuous variables meet these criteria.

Efficacy: skew = -0.24, kurtosis = 0.46

Marriage Opinions: skew = -0.60, kurtosis = -0.33

Political Views: skew = 0.40, kurtosis = -0.92

Belonging: skew = -0.62, kurtosis = 0.05

Do you have any missing data? Once you have removed the cases/participants with missing data, what is your total sample size? Please discuss how much data is missing and whether it’s due to survey design or individual non-response.

I do have missing data. 25 participants did not report an income. 19 participants did not report their political views. 10 participants did not report their perceived marriage importance. 4 participants did not report their perceived belonging. 6 participants did not report their perceived self-efficacy.

Due to how the survey was designed (unit nonresponse), 19 participants did not see the income question, 15 participants did not see the political views question, 6 did not see the marriage importance question, and 3 did not see the efficacy question. 12 participants skipped some of the items selected for analysis (item nonresponse). The percentage of item nonresponse, relative to the number of participants who responded fully, is 1.63%, which is below the 5% cutoff considered a concern in the literature. The original sample size was 3182 participants. After the cases with missing data were removed, the sample size was 3130.