The purpose of this project is to create the datasets used my Tableau dashboard: e-Cigarette Usage in U.S. Grade School Youth.
I selected R as my data preparation tool over SQL as the source was multiple Excel files and R excels in such data environments.
Data Preparation Process:
## libraries & functions ##########################################################################
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# '%!in%' <- function(x,y)!('%in%'(x,y))
## load files #####################################################################################
file.2019 <- cbind(Year = 2019, read.delim('data/2019.txt', stringsAsFactors = F))
file.2018 <- cbind(Year = 2018, read.delim('data/2018.txt', stringsAsFactors = F))
file.2017 <- cbind(Year = 2017, read.delim('data/2017.txt', stringsAsFactors = F))
file.2016 <- cbind(Year = 2016, read.delim('data/2016.txt', stringsAsFactors = F))
file.2015 <- cbind(Year = 2015, read.delim('data/2015.txt', stringsAsFactors = F))
file.2014 <- cbind(Year = 2014, read.delim('data/2014.txt', stringsAsFactors = F))
## create basic dataset 1 of 2: have you ever? ####################################################
haveyouever <- rbind(file.2019 %>% select(StudentLoginID, Year, Q2, Q3, Q34) %>% rename(Student = StudentLoginID, Sex = Q2, Grade = Q3, Answer = Q34),
file.2018 %>% select(student , Year, Q2, Q3, Q28) %>% rename(Student = student , Sex = Q2, Grade = Q3, Answer = Q28),
file.2017 %>% select(student , Year, Q2, Q3, Q28) %>% rename(Student = student , Sex = Q2, Grade = Q3, Answer = Q28),
file.2016 %>% select(student , Year, Q2, Q3, Q26) %>% rename(Student = student , Sex = Q2, Grade = Q3, Answer = Q26),
file.2015 %>% select(STUDENT , Year, q2, q3, q28) %>% rename(Student = STUDENT , Sex = q2, Grade = q3, Answer = q28),
file.2014 %>% select(StudntID , Year, qn2, qn3, qn31) %>% rename(Student = StudntID , Sex = qn2, Grade = qn3, Answer = qn31))
haveyouever <- haveyouever %>% mutate(Question = "Ever") %>% select(Student, Year, Sex, Grade, Question, Answer)
haveyouever$Answer <- case_when( haveyouever$Answer == "1" ~ "Yes",
haveyouever$Answer == "2" ~ "No",
TRUE ~ "N/A"
)
haveyouever$Grade <- case_when( haveyouever$Grade == "1" ~ "6th Grade",
haveyouever$Grade == "2" ~ "7th Grade",
haveyouever$Grade == "3" ~ "8th Grade",
haveyouever$Grade == "4" ~ "9th Grade",
haveyouever$Grade == "5" ~ "10th Grade",
haveyouever$Grade == "6" ~ "11th Grade",
haveyouever$Grade == "7" ~ "12th Grade",
TRUE ~ "N/A"
)
haveyouever$Sex <- case_when( haveyouever$Sex == "1" ~ "Male",
haveyouever$Sex == "2" ~ "Female",
TRUE ~ "Unknown"
)
haveyouever <- haveyouever %>% filter(Grade != "N/A" & Answer != "N/A") # 1) only those who responded and 2) grades 6 - 12
## create basic dataset 2 of 2: 2019 multiple choice questions ####################################
data.2019 <- file.2019 %>%
# Q34 (Have you ever used an e-cigarette, even once or twice?) must be "1", meaning yes, to have answered questions in this dataset
filter(Q34 == "1") %>%
# select relevant columns
select(Year, StudentLoginID, Q2, Q3, Q34,
Q41A, Q41B, Q41C, Q41D, Q41E, Q41F, Q41G, Q41H, Q41I, Q41J,
Q42A, Q42B, Q42C, Q42D, Q42E, Q42F, Q42G, Q42H, Q42I, Q42J, Q42K, Q42L) %>%
# rename common aggregators
rename(Student = StudentLoginID, Sex = Q2, Grade = Q3, Ever = Q34) %>%
# remove respondants not in grades 6 to 12
filter(Grade != "8" & Grade != ".N") %>%
# Rename Q41 sub-questions
rename(WhereGas = Q41A, WhereGrocery = Q41B, WhereDrugstore = Q41C, WhereKiosk = Q41D, WhereInternet = Q41E,
WhereVape = Q41F, WhereOtherPlace = Q41G, WhereFamily = Q41H, WhereFriend = Q41I, WhereOtherPerson = Q41J) %>%
# Rename Q42 sub-questions
rename(WhyFriendFam = Q42A, WhyQuitTobacco = Q42B, WhyCost = Q42C, WhyEasier = Q42D, WhyMedia = Q42E, WhyHarm = Q42F,
WhyFlavor = Q42G, WhyUnnoticed = Q42H, WhyPeers = Q42I, WhyTricks = Q42J, WhyCurious = Q42K, WhyOther = Q42L)
data.2019$Grade <- case_when( data.2019$Grade == "1" ~ "6th Grade",
data.2019$Grade == "2" ~ "7th Grade",
data.2019$Grade == "3" ~ "8th Grade",
data.2019$Grade == "4" ~ "9th Grade",
data.2019$Grade == "5" ~ "10th Grade",
data.2019$Grade == "6" ~ "11th Grade",
data.2019$Grade == "7" ~ "12th Grade",
TRUE ~ "N/A"
)
data.2019$Sex <- case_when( data.2019$Sex == "1" ~ "Male",
data.2019$Sex == "2" ~ "Female",
TRUE ~ "Unknown"
)
## create yes response data to reason and where questions #########################################
reason <- rbind(data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyFriendFam, Answer = "Friend or family member used them"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyQuitTobacco, Answer = "To try to quit using other tobacco products, such as cigarettes"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyCost, Answer = "They cost less than other tobacco products, such as cigarettes"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyEasier, Answer = "They are easier to get than other tobacco products, such as cigarettes"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyMedia, Answer = "I've seen people on TV, online, or in movies use them"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyHarm, Answer = "They are less harmful than other forms of tobacco, such as cigarettes"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyFlavor, Answer = "They are available in flavors, such as mint, candy, fruit, or chocolate"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyUnnoticed, Answer = "I can use them unnoticed at home or at school"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyPeers, Answer = "I was peer pressured into using them"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyTricks, Answer = "I can use them to do tricks"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyCurious, Answer = "I was curious about them"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyOther, Answer = "I used them for some other reason")
)
# only take the yes's
reason <- reason %>% filter(AnswerValue == "1") %>% mutate(Question = "Reason") %>% select(Student, Year, Sex, Grade, Question, Answer)
where <- rbind(data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereGas, Answer = "A gas station or convenience store"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereGrocery, Answer = "A grocery store"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereDrugstore, Answer = "A drugstore"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereKiosk, Answer = "A mall or shopping center kiosk/stand"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereInternet, Answer = "On the Internet"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereVape, Answer = "A vape shop or other store that only sells e-cigarettes"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereOtherPlace, Answer = "Some other place not listed here"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereFamily, Answer = "From a family member"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereFriend, Answer = "From a friend"),
data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereOtherPerson, Answer = "From some other person that is not a family member or a friend")
)
# only take the yes's
where <- where %>% filter(AnswerValue == "1") %>% mutate(Question = "Where") %>% select(Student, Year, Sex, Grade, Question, Answer)
## create final datasets for output ###############################################################
haveyouever.denominator <- haveyouever %>% group_by(Year, Sex, Grade) %>% summarise(Denominator = n_distinct(Student))
haveyouever.numerator <- haveyouever %>% group_by(Year, Sex, Grade, Question, Answer) %>% summarise(Numerator = n_distinct(Student))
#haveyouever.numerator <- haveyouever %>% filter(Answer == "Yes") %>% group_by(Year, Sex, Grade, Question, Answer) %>% summarise(Numerator = n_distinct(Student))
haveyouever.quotient <- haveyouever.numerator %>% left_join(haveyouever.denominator, by = c("Year", "Sex", "Grade")) %>% arrange(desc(Year),desc(Grade),Sex)
multichoice.denominator <- data.2019 %>% group_by(Year, Sex, Grade) %>% summarise(Denominator = n_distinct(Student))
multichoice.numerator <- rbind(reason, where) %>% group_by(Year, Sex, Grade, Question, Answer) %>% summarise(Numerator = n_distinct(Student))
multichoice.quotient <- multichoice.numerator %>% left_join(multichoice.denominator, by = c("Year", "Sex", "Grade")) %>% arrange(desc(Year),desc(Grade),Sex)
multichoice.allgrades <- multichoice.quotient %>%
group_by(Year, Sex, Question, Answer) %>%
summarise(Numerator = sum(Numerator), Denominator = sum(Denominator)) %>%
mutate(Grade = "All") %>%
select(Year, Sex, Grade, Question, Answer, Numerator, Denominator)
multichoice.quotient <- rbind(multichoice.quotient, multichoice.allgrades)
## output to csv file #############################################################################
write.csv(rbind(haveyouever.quotient,
multichoice.quotient),
file = "NYTS.csv",
row.names = F)