# data import, cleaning, and recoding including IV
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# let's use our own data from a pilot test I ran
# for your report, you should replace this data with the actual one!
# Read the raw survey export into `projectdata`.
# All columns are parsed as character because the first two data rows hold
# question labels and import metadata rather than responses; those rows are
# dropped later, once the per-group subsets are built.
projectdata <- read_csv("~/大四下/2-Research Design DACSS 602/Final Project/RD_Omnibus_SP24final_May+10,+2024_09.34.csv")
Rows: 140 Columns: 220
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (220): StartDate, EndDate, Status, IPAddress, Progress, Duration (in sec...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet viewer only in an interactive session: utils::View()
# needs a GUI and is not meaningful when the script is rendered or run in batch.
if (interactive()) {
  View(projectdata) # example data
}
# Print the first rows; rows 1-2 are label/metadata rows, not responses.
head(projectdata)
# A tibble: 6 × 220
StartDate EndDate Status IPAddress Progress Duration (in seconds…¹ Finished
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 "Start Date" "End D… "Resp… "IP Addr… "Progre… "Duration (in seconds… "Finish…
2 "{\"ImportI… "{\"Im… "{\"I… "{\"Impo… "{\"Imp… "{\"ImportId\":\"dura… "{\"Imp…
3 "2024-05-08… "2024-… "IP A… "68.104.… "100" "965" "True"
4 "2024-05-08… "2024-… "IP A… "172.56.… "100" "321" "True"
5 "2024-05-08… "2024-… "IP A… "138.88.… "100" "18" "True"
6 "2024-05-08… "2024-… "IP A… "73.176.… "100" "143" "True"
# ℹ abbreviated name: ¹`Duration (in seconds)`
# ℹ 213 more variables: RecordedDate <chr>, ResponseId <chr>,
# RecipientLastName <chr>, RecipientFirstName <chr>, RecipientEmail <chr>,
# ExternalReference <chr>, LocationLatitude <chr>, LocationLongitude <chr>,
# DistributionChannel <chr>, UserLanguage <chr>, Q_RecaptchaScore <chr>,
# Q_RelevantIDDuplicate <chr>, Q_RelevantIDDuplicateScore <chr>,
# Q_RelevantIDFraudScore <chr>, Q_RelevantIDLastStartDate <chr>, …
# subsetting demographic questions for all groups
# Keep every non-preview row and only the respondent id (`rid`) plus the
# demographic columns, so this table can be joined onto each group's answers.
demos <- projectdata %>%
  filter(Status != "Survey Preview") %>%
  select(
    rid, age, gender, hhi, ethnicity, hispanic,
    education, political_party, region, zip
  ) # codebook is available in Canvas
# Sanity check: number of rows and columns kept.
dim(demos)
[1] 140 10
# Group 4 as an example
# Select group 4's question columns (g4EQ*) and display-order columns (g4_*),
# attach the demographics by respondent id, then drop the two label/metadata
# rows that the survey export places at the top of the file.
# NOTE(review): the join key is inferred (`rid`) and the rendered run reports a
# many-to-many match, i.e. `rid` is not unique in both tables -- confirm whether
# duplicated or missing ids should be removed before joining.
g4 <- projectdata %>%
  filter(Status != "Survey Preview") %>% # remove previews
  select(starts_with(c("g4EQ", "g4_")), rid) %>%
  full_join(demos) %>% # we want to keep all variables
  filter(!row_number() %in% c(1, 2)) # drop the first two rows
Joining with `by = join_by(rid)`
Warning in full_join(., demos): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 94 of `x` matches multiple rows in `y`.
ℹ Row 94 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
"many-to-many"` to silence this warning.
# Print the first rows of the joined group-4 table to check the result.
head(g4)
# A tibble: 6 × 27
`g4EQ1 ` g4EQ2 g4EQ3 g4EQ4 g4EQ5 g4EQ6 g4EQ7 g4_DO_g4EQ3 g4_DO_g4EQ4
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Strongly agree Stro… Stro… Some… Some… Some… Some… 4 5
2 Somewhat agree Neit… Neit… Neit… Some… Some… Neit… 4 5
3 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
4 Somewhat disagree Some… Neit… Some… Stro… Neit… Some… 4 5
5 Somewhat agree Some… Some… Some… Some… Some… Some… 4 5
6 Neither agree nor… Some… Stro… Stro… Stro… Stro… Stro… 4 5
# ℹ 18 more variables: g4_DO_g4EQ5 <chr>, g4_DO_g4EQ6 <chr>, g4_DO_g4s2 <chr>,
# g4_DO_g4s3 <chr>, `g4_DO_g4EQ1 ` <chr>, g4_DO_g4EQ2 <chr>,
# g4_DO_g4s1 <chr>, g4_DO_g4EQ7 <chr>, rid <chr>, age <chr>, gender <chr>,
# hhi <chr>, ethnicity <chr>, hispanic <chr>, education <chr>,
# political_party <chr>, region <chr>, zip <chr>
# IV
# focus on g4_DO (Display Order: g#_DO) to find
# which treatment R received.
# Group 4's treatments are recorded in the display-order columns
# g4_DO_g4s1, g4_DO_g4s2, g4_DO_g4s3
# what we'd like to do is to create a new variable
# that has three levels
# for the three treatments
# to use the str_detect() function, let's load the stringr package
library(stringr)
# Build the independent variable: `treatment` is 1, 2 or 3 depending on which
# of the display-order columns g4_DO_g4s1 / g4_DO_g4s2 / g4_DO_g4s3 contains
# a "1". Conditions are tested in order and the first match wins; rows where
# no column matches (including all-NA rows) get NA.
# NOTE(review): the pattern "1" matches any value containing the digit 1
# (e.g. "10"); presumably these columns only ever hold "1" or NA -- confirm.
g4_clean <- g4 %>%
  mutate(treatment = case_when(
    str_detect(g4_DO_g4s1, "1") ~ 1,
    str_detect(g4_DO_g4s2, "1") ~ 2,
    str_detect(g4_DO_g4s3, "1") ~ 3
  ))
# Clean the data and rename the columns
library(dplyr)
# Final analysis table for group 4:
#   1. drop the display-order helper columns (names containing "g4_DO"),
#   2. give the seven outcome questions descriptive names,
#   3. remove every row that has an NA in any remaining column.
# The first question's column name in the export ends with a space
# (`g4EQ1 `), so it must be referenced with backticks.
g4_final <- g4_clean %>%
  select(-contains("g4_DO")) %>%
  rename(
    "financial_support" = `g4EQ1 `,
    "career_development" = "g4EQ2",
    "enough_time" = "g4EQ3",
    "arguments" = "g4EQ4",
    "independent_child" = "g4EQ5",
    "relax" = "g4EQ6",
    "reasonable" = "g4EQ7"
  ) %>%
  na.omit()
# Inspect the result (viewer, then first rows in the console).
view(g4_final)
head(g4_final)
# A tibble: 6 × 18
financial_support career_development enough_time arguments independent_child
<chr> <chr> <chr> <chr> <chr>
1 Strongly agree Strongly agree Strongly a… Somewhat… Somewhat agree
2 Somewhat agree Neither agree nor… Neither ag… Neither … Somewhat agree
3 Somewhat disagree Somewhat disagree Neither ag… Somewhat… Strongly disagree
4 Somewhat agree Somewhat agree Somewhat a… Somewhat… Somewhat agree
5 Neither agree nor … Somewhat agree Strongly a… Strongly… Strongly agree
6 Somewhat disagree Somewhat disagree Somewhat a… Somewhat… Neither agree no…
# ℹ 13 more variables: relax <chr>, reasonable <chr>, rid <chr>, age <chr>,
# gender <chr>, hhi <chr>, ethnicity <chr>, hispanic <chr>, education <chr>,
# political_party <chr>, region <chr>, zip <chr>, treatment <dbl>
# Here, I recoded our dependent variable questions to fit the Likert scale,
# allowing us to perform analytics on the respondents.
#
# Each outcome question gets a numeric companion column named `<question>_num`
# (appended after the existing columns, in the order listed below). The score
# is the position of the answer label in `likert_levels`:
#   1 = Strongly disagree ... 5 = Strongly agree.
# Any answer that is not one of the five labels (including NA) becomes NA.
likert_levels <- c(
  "Strongly disagree",
  "Somewhat disagree",
  "Neither agree nor disagree",
  "Somewhat agree",
  "Strongly agree"
)

# Outcome questions to recode.
likert_items <- c(
  "financial_support",
  "career_development",
  "enough_time",
  "arguments",
  "independent_child",
  "relax",
  "reasonable"
)

projectdata_recoded <- g4_final %>%
  mutate(across(
    all_of(likert_items),
    # match() returns an integer position; convert to double so the new
    # columns keep the same numeric type as a `case_when(... ~ 1, ...)` recode.
    function(answer) as.numeric(match(answer, likert_levels)),
    .names = "{.col}_num"
  ))
# Here, I recoded our demographic questions to fit the categorical variable,
# allowing us to perform analytics on the respondents.
#
# Each demographic column holds the answer code as a character string; the
# recodes below replace each code with its label, column by column, in place.
# Codes that are not listed are left unchanged by recode().

# Gender
projectdata_recoded$gender <- recode(
  projectdata_recoded$gender,
  "1" = "Male",
  "2" = "Female"
)

# Highest level of education
projectdata_recoded$education <- recode(
  projectdata_recoded$education,
  "1" = "Some high school or less",
  "2" = "High school graduate",
  "3" = "Other post high school vocational training",
  "4" = "Completed some college, but no degree",
  "5" = "Associate's degree",
  "6" = "Bachelor's degree",
  "7" = "Master's or professional degree",
  "8" = "Doctorate degree",
  "-3105" = "None of the above"
)

# Race / ethnicity
projectdata_recoded$ethnicity <- recode(
  projectdata_recoded$ethnicity,
  "1" = "White",
  "2" = "Black, or African American",
  "3" = "American Indian or Alaska Native",
  "4" = "Asian *** Asian Indian",
  "5" = "Asian *** Chinese",
  "6" = "Asian *** Filipino",
  "7" = "Asian *** Japanese",
  "8" = "Asian *** Korean",
  "9" = "Asian *** Vietnamese",
  "10" = "Asian *** Other",
  "11" = "Pacific Islander *** Native Hawaiian",
  "12" = "Pacific Islander *** Guamanian",
  "13" = "Pacific Islander *** Samoan",
  "14" = "Pacific Islander *** Other Pacific Islander",
  "15" = "Some other race",
  "16" = "Prefer not to answer"
)

# Hispanic, Latino, or Spanish origin
projectdata_recoded$hispanic <- recode(
  projectdata_recoded$hispanic,
  "1" = "No , not of Hispanic, Latino, or Spanish origin",
  "2" = "Yes, Mexican, Mexican American, Chicano",
  "3" = "Yes, Cuban",
  "4" = "Yes, another Hispanic, Latino, or Spanish origin *** Argentina",
  "5" = "Yes, another Hispanic, Latino, or Spanish origin *** Colombia",
  "6" = "Yes, another Hispanic, Latino, or Spanish origin *** Ecuador",
  "7" = "Yes, another Hispanic, Latino, or Spanish origin *** El Salvadore",
  "8" = "Yes, another Hispanic, Latino, or Spanish origin *** Guatemala",
  "9" = "Yes, another Hispanic, Latino, or Spanish origin *** Nicaragua",
  "10" = "Yes, another Hispanic, Latino, or Spanish origin *** Panama",
  "11" = "Yes, another Hispanic, Latino, or Spanish origin *** Peru",
  "12" = "Yes, another Hispanic, Latino, or Spanish origin *** Spain",
  "13" = "Yes, another Hispanic, Latino, or Spanish origin *** Venezuela",
  "14" = "Yes, another Hispanic, Latino, or Spanish origin *** Other Country",
  "15" = "Prefer not to answer"
)

# Household income bracket
projectdata_recoded$hhi <- recode(
  projectdata_recoded$hhi,
  "1" = "Less than $14,999",
  "2" = "$15,000 to $19,999",
  "3" = "$20,000 to $24,999",
  "4" = "$25,000 to $29,999",
  "5" = "$30,000 to $34,999",
  "6" = "$35,000 to $39,999",
  "7" = "$40,000 to $44,999",
  "8" = "$45,000 to $49,999",
  "9" = "$50,000 to $54,999",
  "10" = "$55,000 to $59,999",
  "11" = "$60,000 to $64,999",
  "12" = "$65,000 to $69,999",
  "13" = "$70,000 to $74,999",
  "14" = "$75,000 to $79,999",
  "15" = "$80,000 to $84,999",
  "16" = "$85,000 to $89,999",
  "17" = "$90,000 to $94,999",
  "18" = "$95,000 to $99,999",
  "19" = "$100,000 to $124,999",
  "20" = "$125,000 to $149,999",
  "21" = "$150,000 to $174,999",
  "22" = "$175,000 to $199,999",
  "23" = "$200,000 to $249,999",
  "24" = "$250,000 and above",
  "-3105" = "Prefer not to answer"
)

# Political party identification
projectdata_recoded$political_party <- recode(
  projectdata_recoded$political_party,
  "1" = "Strong Democrat",
  "2" = "Not very strong Democrat",
  "3" = "Independent Democrat",
  "4" = "Independent - neither",
  "5" = "Independent Republican",
  "6" = "Other - leaning Democrat",
  "7" = "Other - neither",
  "8" = "Other - leaning Republican",
  "9" = "Not very strong Republican",
  "10" = "Strong Republican"
)

# Region
projectdata_recoded$region <- recode(
  projectdata_recoded$region,
  "1" = "Northeast",
  "2" = "Midwest",
  "3" = "South",
  "4" = "West"
)