The purpose of this project is to create the datasets used my Tableau dashboard: e-Cigarette Usage in U.S. Grade School Youth.

I selected R as my data preparation tool over SQL as the source was multiple Excel files and R excels in such data environments.

Data Preparation Process:

  1. Download 2014 to 2019 files from the CDC’s site
  2. Run the files through this R program
  3. Post output to ..\My Tableau Repository\Datasources
  4. Develop and post the Tableau dashboard
## libraries & functions ##########################################################################

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# '%!in%' <- function(x,y)!('%in%'(x,y))

## load files #####################################################################################

file.2019 <- cbind(Year = 2019, read.delim('data/2019.txt', stringsAsFactors = F))
file.2018 <- cbind(Year = 2018, read.delim('data/2018.txt', stringsAsFactors = F))
file.2017 <- cbind(Year = 2017, read.delim('data/2017.txt', stringsAsFactors = F))
file.2016 <- cbind(Year = 2016, read.delim('data/2016.txt', stringsAsFactors = F))
file.2015 <- cbind(Year = 2015, read.delim('data/2015.txt', stringsAsFactors = F))
file.2014 <- cbind(Year = 2014, read.delim('data/2014.txt', stringsAsFactors = F))

## create basic dataset 1 of 2: have you ever? ####################################################

haveyouever <- rbind(file.2019 %>% select(StudentLoginID, Year, Q2,  Q3,  Q34)  %>% rename(Student = StudentLoginID, Sex = Q2,  Grade = Q3,  Answer = Q34),
                     file.2018 %>% select(student       , Year, Q2,  Q3,  Q28)  %>% rename(Student = student       , Sex = Q2,  Grade = Q3,  Answer = Q28),
                     file.2017 %>% select(student       , Year, Q2,  Q3,  Q28)  %>% rename(Student = student       , Sex = Q2,  Grade = Q3,  Answer = Q28),
                     file.2016 %>% select(student       , Year, Q2,  Q3,  Q26)  %>% rename(Student = student       , Sex = Q2,  Grade = Q3,  Answer = Q26),
                     file.2015 %>% select(STUDENT       , Year, q2,  q3,  q28)  %>% rename(Student = STUDENT       , Sex = q2,  Grade = q3,  Answer = q28),
                     file.2014 %>% select(StudntID      , Year, qn2, qn3, qn31) %>% rename(Student = StudntID      , Sex = qn2, Grade = qn3, Answer = qn31))

haveyouever <- haveyouever %>% mutate(Question = "Ever") %>% select(Student, Year, Sex, Grade, Question, Answer)

haveyouever$Answer  <- case_when( haveyouever$Answer == "1" ~ "Yes",
                                  haveyouever$Answer == "2" ~ "No",
                                  TRUE ~ "N/A"
)

haveyouever$Grade <- case_when( haveyouever$Grade == "1" ~ "6th Grade",
                                haveyouever$Grade == "2" ~ "7th Grade",
                                haveyouever$Grade == "3" ~ "8th Grade",
                                haveyouever$Grade == "4" ~ "9th Grade",
                                haveyouever$Grade == "5" ~ "10th Grade",
                                haveyouever$Grade == "6" ~ "11th Grade",
                                haveyouever$Grade == "7" ~ "12th Grade",
                                TRUE ~ "N/A"
)

haveyouever$Sex   <- case_when( haveyouever$Sex == "1" ~ "Male",
                                haveyouever$Sex == "2" ~ "Female",
                                TRUE ~ "Unknown"
)

haveyouever <- haveyouever %>% filter(Grade != "N/A" & Answer != "N/A") # 1) only those who responded and 2) grades 6 - 12


## create basic dataset 2 of 2: 2019 multiple choice questions ####################################

data.2019 <- file.2019 %>% 
  
  # Q34 (Have you ever used an e-cigarette, even once or twice?) must be "1", meaning yes, to have answered questions in this dataset
  filter(Q34 == "1") %>% 
  
  # select relevant columns
  select(Year, StudentLoginID, Q2, Q3, Q34, 
         Q41A, Q41B, Q41C, Q41D, Q41E, Q41F, Q41G, Q41H, Q41I, Q41J,
         Q42A, Q42B, Q42C, Q42D, Q42E, Q42F, Q42G, Q42H, Q42I, Q42J, Q42K, Q42L) %>% 
  
  # rename common aggregators
  rename(Student = StudentLoginID, Sex = Q2, Grade = Q3, Ever = Q34) %>% 
  
  # remove respondants not in grades 6 to 12
  filter(Grade != "8" & Grade != ".N") %>% 
  
  # Rename Q41 sub-questions
  rename(WhereGas = Q41A, WhereGrocery = Q41B, WhereDrugstore = Q41C, WhereKiosk = Q41D, WhereInternet = Q41E, 
         WhereVape = Q41F, WhereOtherPlace = Q41G, WhereFamily = Q41H, WhereFriend = Q41I, WhereOtherPerson = Q41J) %>% 
  
  # Rename Q42 sub-questions
  rename(WhyFriendFam = Q42A, WhyQuitTobacco = Q42B, WhyCost = Q42C, WhyEasier = Q42D, WhyMedia = Q42E, WhyHarm = Q42F, 
         WhyFlavor = Q42G, WhyUnnoticed = Q42H, WhyPeers = Q42I, WhyTricks = Q42J, WhyCurious = Q42K, WhyOther = Q42L)

data.2019$Grade <- case_when( data.2019$Grade == "1" ~ "6th Grade",
                              data.2019$Grade == "2" ~ "7th Grade",
                              data.2019$Grade == "3" ~ "8th Grade",
                              data.2019$Grade == "4" ~ "9th Grade",
                              data.2019$Grade == "5" ~ "10th Grade",
                              data.2019$Grade == "6" ~ "11th Grade",
                              data.2019$Grade == "7" ~ "12th Grade",
                              TRUE ~ "N/A"
)

data.2019$Sex   <- case_when( data.2019$Sex == "1" ~ "Male",
                              data.2019$Sex == "2" ~ "Female",
                              TRUE ~ "Unknown"
)

## create yes response data to reason and where questions #########################################

reason <- rbind(data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyFriendFam, Answer = "Friend or family member used them"),
                
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyQuitTobacco, Answer = "To try to quit using other tobacco products, such as cigarettes"),
                
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyCost, Answer = "They cost less than other tobacco products, such as cigarettes"),
                
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyEasier, Answer = "They are easier to get than other tobacco products, such as cigarettes"),
                
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyMedia, Answer = "I've seen people on TV, online, or in movies use them"),
                
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyHarm, Answer = "They are less harmful than other forms of tobacco, such as cigarettes"),
                
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyFlavor, Answer = "They are available in flavors, such as mint, candy, fruit, or chocolate"),
                
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyUnnoticed, Answer = "I can use them unnoticed at home or at school"),
                
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyPeers, Answer = "I was peer pressured into using them"),
                
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyTricks, Answer = "I can use them to do tricks"),
                
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyCurious, Answer = "I was curious about them"),
                
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhyOther, Answer = "I used them for some other reason")
)
# only take the yes's
reason <- reason %>% filter(AnswerValue == "1") %>% mutate(Question = "Reason") %>% select(Student, Year, Sex, Grade, Question, Answer)

where <-  rbind(data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereGas, Answer = "A gas station or convenience store"),
               
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereGrocery, Answer = "A grocery store"),
               
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereDrugstore, Answer = "A drugstore"),
               
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereKiosk, Answer = "A mall or shopping center kiosk/stand"),
               
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereInternet, Answer = "On the Internet"),
               
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereVape, Answer = "A vape shop or other store that only sells e-cigarettes"),
               
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereOtherPlace, Answer = "Some other place not listed here"),
               
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereFamily, Answer = "From a family member"),
               
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereFriend, Answer = "From a friend"),
               
                data.frame(stringsAsFactors = F, Student = data.2019$Student, Year = data.2019$Year, Sex = data.2019$Sex, Grade = data.2019$Grade, AnswerValue = data.2019$WhereOtherPerson, Answer = "From some other person that is not a family member or a friend")
)

# only take the yes's
where <- where %>% filter(AnswerValue == "1") %>% mutate(Question = "Where") %>% select(Student, Year, Sex, Grade, Question, Answer)

## create final datasets for output ###############################################################

haveyouever.denominator <- haveyouever %>% group_by(Year, Sex, Grade) %>% summarise(Denominator = n_distinct(Student))
haveyouever.numerator   <- haveyouever %>% group_by(Year, Sex, Grade, Question, Answer) %>% summarise(Numerator = n_distinct(Student))
#haveyouever.numerator   <- haveyouever %>% filter(Answer == "Yes") %>% group_by(Year, Sex, Grade, Question, Answer) %>% summarise(Numerator = n_distinct(Student))
haveyouever.quotient    <- haveyouever.numerator %>% left_join(haveyouever.denominator, by = c("Year", "Sex", "Grade")) %>% arrange(desc(Year),desc(Grade),Sex)

multichoice.denominator <- data.2019 %>% group_by(Year, Sex, Grade) %>% summarise(Denominator = n_distinct(Student))
multichoice.numerator   <- rbind(reason, where) %>% group_by(Year, Sex, Grade, Question, Answer) %>% summarise(Numerator = n_distinct(Student))
multichoice.quotient    <- multichoice.numerator %>% left_join(multichoice.denominator, by = c("Year", "Sex", "Grade")) %>% arrange(desc(Year),desc(Grade),Sex) 
multichoice.allgrades   <- multichoice.quotient %>% 
                           group_by(Year, Sex, Question, Answer) %>% 
                           summarise(Numerator = sum(Numerator), Denominator = sum(Denominator)) %>%
                           mutate(Grade = "All") %>%
                           select(Year, Sex, Grade, Question, Answer, Numerator, Denominator)
multichoice.quotient    <- rbind(multichoice.quotient, multichoice.allgrades)


## output to csv file #############################################################################

write.csv(rbind(haveyouever.quotient,
                multichoice.quotient), 
          file = "NYTS.csv",
          row.names = F)