R Programming Data Wrangling Hands On Project

Libraries and Data Scoure

library(tidyverse)
library(hunspell)
library(RColorBrewer)
survey <- read.csv("~/Desktop/NCSU/Summer/AA500/R/Project/Get_to_know_survey_responses.csv")
knitr::opts_chunk$set(echo = FALSE)

Questions 1 and 2

Create a new column called Birth_Month which has only the month portion of Birth_Month_Year. Create a new column called Birth_Year which has only the year portion of Birth_Month_Year.

survey <- survey %>%
  rowwise() %>%
  mutate(Birth_Month=str_split(Birth_Month_Year,", ")[[1]][1])%>%
  mutate(Birth_Year=str_split(Birth_Month_Year,", ")[[1]][2]) %>% 
  mutate(Birth_Month = str_trim(Birth_Month))

Question 3

Since the birth month and birth year were open ended fields, there are many inconsistencies in the data. Standardize the values for Birth_Month and Birth_Year.

#Function to Convert Numbers to Months
full_month <- function(data){
  output <- month.name[data]
  return(output)
}

for(i in 1:length(survey$Birth_Month)){
  if(survey$Birth_Month[i] %in% month.name){
    next
  }else{
    survey$Birth_Month[i] <- full_month(as.numeric(survey$Birth_Month[i]))
  }
}

Question 4

Show the five most frequent birth months, and the five least frequent birth months.

Please note that many of the questions indicate showing the 5 most and 5 least frequent responses. Many of the categories resulted in tied values. Given the alphabetical sorting of the columns, the displayed results only reflect the first 5 outputs for the category.

Top 5 Birth Months

tot_bmonth <- survey %>%
  group_by(Birth_Month) %>%
  count(sort = TRUE)
head(tot_bmonth, n=5)
## # A tibble: 5 x 2
## # Groups:   Birth_Month [5]
##   Birth_Month     n
##   <chr>       <int>
## 1 April          14
## 2 July           13
## 3 November       13
## 4 May            12
## 5 September      12

Bottom 5 Birth Months

tail(tot_bmonth, n=5)
## # A tibble: 5 x 2
## # Groups:   Birth_Month [5]
##   Birth_Month     n
##   <chr>       <int>
## 1 October         8
## 2 August          7
## 3 December        7
## 4 January         7
## 5 March           5
graph <- survey %>%
  mutate(Birth_Month = factor(Birth_Month, levels = month.name)) %>%
  arrange(Birth_Month)

nb.cols <- 12
mycolors <- colorRampPalette(brewer.pal(8, "YlGnBu"))(nb.cols)

p = ggplot(data = graph, aes(x= Birth_Month, fill = Birth_Month))
p = p + geom_bar()
p = p + labs(title = "Number of Students Born per Month", y = "Number of Students", x = "Month")
p = p + theme(axis.text.x = element_text(angle = 60, hjust = 1))
p = p + scale_fill_manual(values = mycolors)
p  

Question 5

Show the five most frequent birth years, and the five least frequent birth year.

Top 5 Birth Years

tot_byear <- survey %>%
  group_by(Birth_Year) %>%
  count(sort = TRUE)
head(tot_byear, n=5)
## # A tibble: 5 x 2
## # Groups:   Birth_Year [5]
##   Birth_Year     n
##   <chr>      <int>
## 1 1999          23
## 2 1998          18
## 3 1996          15
## 4 1997          12
## 5 1995           9

Bottom 5 Birth Years

tail(tot_byear, n=5)
## # A tibble: 5 x 2
## # Groups:   Birth_Year [5]
##   Birth_Year     n
##   <chr>      <int>
## 1 1988           3
## 2 1990           3
## 3 1987           2
## 4 1975           1
## 5 1981           1
nb.cols <- 16
mycolors <- colorRampPalette(brewer.pal(8, "YlGnBu"))(nb.cols)

p1 = ggplot(data = survey, aes(x= Birth_Year, fill = Birth_Year))
p1 = p1 + geom_bar()
p1 = p1 + labs(title = "Number of Students Born per Year", y = "Number of Students", x = "Year")
p1 = p1 + theme(axis.text.x = element_text(angle = 60, hjust = 1))
p1 = p1 +  scale_fill_manual(values = mycolors)
p1 

Question 6

The State column was also open ended, so there are many inconsistencies. Clean this column to eliminate any non-US state, and to standardize the values for State.

#Function to Make State Names in Full Form
unabrev <- function(data){
  output <- state.name[grep(data, state.abb,ignore.case=T)]
  return(output)
}

#Loop to Check and Fix State Abbreviations
for(i in 1:length(survey$State)){
  survey$State[i] <- trimws(survey$State[i])
  if(is.na(survey$State[i])){
    survey$State[i] <- ""
  }
  if(survey$State[i] %in% state.abb){
    survey$State[i] <- unabrev(survey$State[i])
  }
}

#Fix State Misspellings 
state_dic <- dictionary(lang = "en_US", affix = NULL, add_words = state.name,
                        cache = TRUE)

#Function to Extract State Name
unstate <- function(input){
  output <- ""
  for(n in 1:length(input)){
    if(input[[1]][n] %in% state.name){
      output <- input[[1]][n]
    }
  }
  return(output)
}

#Loop to Check for Non State Names and Enter in the Function
for(y in 1:length(survey$State)){
  if(survey$State[y] %in% state.name){
    next
  }else{
    test <- hunspell_suggest(survey$State[y], dict=state_dic)
    survey$State[y] <- unstate(test)
  }
}

Question 7

Show the five most frequent states, and the five least frequent state.

Top 5 States

tot_state <- survey %>%
  group_by(State) %>%
  count(sort = TRUE)
tot_state <- tot_state[!apply(tot_state[,1] =="", 1, all),]
head(tot_state, n=5)
## # A tibble: 5 x 2
## # Groups:   State [5]
##   State              n
##   <chr>          <int>
## 1 North Carolina    68
## 2 New Jersey         5
## 3 New York           4
## 4 California         3
## 5 Pennsylvania       3

Bottom 5 States

tail(tot_state, n=5)
## # A tibble: 5 x 2
## # Groups:   State [5]
##   State         n
##   <chr>     <int>
## 1 Minnesota     1
## 2 Missouri      1
## 3 Tennessee     1
## 4 Texas         1
## 5 Wisconsin     1

Student’s Home State on US Map

States

Question 8

Similar to State, the column Country needs to be standardized. Once you standardize it, show the five most frequent countries, and the five least frequent countries.

Top 5 Countries

usvar <- c("USA", "US", "U.S.", "U.S.A.", "United States of America")
for(i in 1:length(survey$Country)){
  if(survey$Country[i] %in% usvar){
    survey$Country[i] <- "United States"
  }
  survey$Country[i] <- trimws(survey$Country[i])
}
survey <- survey %>% 
  mutate(Country = ifelse(str_detect(Country,"Durham") == TRUE,"United States", Country)) 


tot_country <- survey %>%
  group_by(Country) %>%
  count(sort = TRUE)
head(tot_country, n=5)
## # A tibble: 5 x 2
## # Groups:   Country [5]
##   Country           n
##   <chr>         <int>
## 1 United States    93
## 2 India             6
## 3 China             4
## 4 England           2
## 5 Russia            2

Bottom 5 Countries

tail(tot_country, n=5)
## # A tibble: 5 x 2
## # Groups:   Country [5]
##   Country     n
##   <chr>   <int>
## 1 Mexico      1
## 2 Nepal       1
## 3 Peru        1
## 4 Serbia      1
## 5 Spain       1
nb.cols <- 12
mycolors <- colorRampPalette(brewer.pal(11, "YlGnBu"))(nb.cols)

survey <- survey %>%
  rowwise() %>%
  mutate(Country = ifelse(str_detect(Country,"United States") == TRUE,NA, Country))

p2 = ggplot(data=subset(survey, !is.na(Country)), aes(x= Country, fill = Country)) + geom_bar() + labs(title = "Students and their Home Country not including US", y = "Number of Students", x = "Country")
p2 = p2 + theme(axis.text.x = element_text(angle = 60, hjust = 1))
p2 = p2 + scale_fill_manual(values = mycolors)
p2

Student’s Home Country on World Map

Countries

Question 9

We want to clean the How_Hear_About_Program column by eliminating the number and any extra character portion in those responses (so we keep only the text portion). Show the five most frequent responses to this column.

Please note that the written response “Friends” was added into the “Recommended by family/friends” category.

survey$How_Hear_About_Program <-str_remove_all(survey$How_Hear_About_Program, "[:digit:]. ")
survey <- survey %>% 
  mutate(How_Hear_About_Program = ifelse(str_detect(How_Hear_About_Program,"Friends") == TRUE,"Recommended by family/friends", How_Hear_About_Program))

hear_tot <- survey %>%
  group_by(How_Hear_About_Program) %>%
  count(sort = TRUE)

head(hear_tot, n=5)
## # A tibble: 5 x 2
## # Groups:   How_Hear_About_Program [5]
##   How_Hear_About_Program                       n
##   <chr>                                    <int>
## 1 Recommended by family/friends               53
## 2 Search engine                               28
## 3 Recommended by professors                   21
## 4 On social media (Twitter, Instagram, FB)     4
## 5 Recommendations found in blogs               2
graph1 <- survey %>%
  arrange(How_Hear_About_Program)

nb.cols <- 13
mycolors <- colorRampPalette(brewer.pal(8, "YlGnBu"))(nb.cols)

p = ggplot(data = graph1, aes(x= How_Hear_About_Program, fill = How_Hear_About_Program))
p = p + geom_bar()
p = p + labs(title = "How Students Heard about the Program", y = "Number of Students", x = "Responses")
p = p + theme(axis.text.x = element_text(angle = 60, hjust = 1))
p = p + scale_fill_manual(values = mycolors) + coord_flip()
p  

Question 10

We want to transform the data from wide to long on column Languages by duplicating the values of the other columns. You can use the comma as a separator for Languages. Show the five most frequent languages.

survey <- survey %>% 
  mutate(Languages = ifelse(str_detect(Languages,"Ru") == TRUE,"Russian", Languages)) %>% 
  mutate(Languages = ifelse(str_detect(Languages,"Spanish") == TRUE,"Spanish", Languages))

languagecount <- survey %>%
  separate(Languages, into= c("lang1","lang2", "lang3", "lang4"), sep=", ") %>% 
  gather(key= nothing, value = Languages, lang1:lang4, na.rm=T) %>% 
  select(-nothing) %>% 
  group_by(Languages) %>%
  count(sort = TRUE)

tail(languagecount, n=5)
## # A tibble: 5 x 2
## # Groups:   Languages [5]
##   Languages      n
##   <chr>      <int>
## 1 Nepali         1
## 2 Odia           1
## 3 Swahili        1
## 4 Urdu           1
## 5 Vietnamese     1
languagecount1 <- survey %>%
  separate(Languages, into= c("lang1","lang2", "lang3", "lang4"), sep=", ") %>% 
  gather(key= nothing, value = Languages, lang1:lang4, na.rm=T) %>% 
  select(-nothing)

graph2 <- languagecount1 %>%
  arrange(Languages)

nb.cols <- 19
mycolors <- colorRampPalette(brewer.pal(8, "YlGnBu"))(nb.cols)

p6 = ggplot(data = graph2, aes(x= Languages, fill = Languages))
p6 = p6 + geom_bar()
p6 = p6 + labs(title = "Languages Spoken by Students", y = "Number of Students", x = "Languages")
p6 = p6 + theme(axis.text.x = element_text(angle = 60, hjust = 1))
p6 = p6 + scale_fill_manual(values = mycolors) + coord_flip()
p6