library(tidyverse)
library(hunspell)
library(RColorBrewer)
survey <- read.csv("~/Desktop/NCSU/Summer/AA500/R/Project/Get_to_know_survey_responses.csv")
knitr::opts_chunk$set(echo = FALSE)
Create a new column called Birth_Month which has only the month portion of Birth_Month_Year. Create a new column called Birth_Year which has only the year portion of Birth_Month_Year.
survey <- survey %>%
rowwise() %>%
mutate(Birth_Month=str_split(Birth_Month_Year,", ")[[1]][1])%>%
mutate(Birth_Year=str_split(Birth_Month_Year,", ")[[1]][2]) %>%
mutate(Birth_Month = str_trim(Birth_Month))
Since the birth month and birth year were open ended fields, there are many inconsistencies in the data. Standardize the values for Birth_Month and Birth_Year.
#Function to Convert Numbers to Months
full_month <- function(data){
output <- month.name[data]
return(output)
}
for(i in 1:length(survey$Birth_Month)){
if(survey$Birth_Month[i] %in% month.name){
next
}else{
survey$Birth_Month[i] <- full_month(as.numeric(survey$Birth_Month[i]))
}
}
Show the five most frequent birth months, and the five least frequent birth months.
Please note that many of the questions indicate showing the 5 most and 5 least frequent responses. Many of the categories resulted in tied values. Given the alphabetical sorting of the columns, the displayed results only reflect the first 5 outputs for the category.
tot_bmonth <- survey %>%
group_by(Birth_Month) %>%
count(sort = TRUE)
head(tot_bmonth, n=5)
## # A tibble: 5 x 2
## # Groups: Birth_Month [5]
## Birth_Month n
## <chr> <int>
## 1 April 14
## 2 July 13
## 3 November 13
## 4 May 12
## 5 September 12
tail(tot_bmonth, n=5)
## # A tibble: 5 x 2
## # Groups: Birth_Month [5]
## Birth_Month n
## <chr> <int>
## 1 October 8
## 2 August 7
## 3 December 7
## 4 January 7
## 5 March 5
graph <- survey %>%
mutate(Birth_Month = factor(Birth_Month, levels = month.name)) %>%
arrange(Birth_Month)
nb.cols <- 12
mycolors <- colorRampPalette(brewer.pal(8, "YlGnBu"))(nb.cols)
p = ggplot(data = graph, aes(x= Birth_Month, fill = Birth_Month))
p = p + geom_bar()
p = p + labs(title = "Number of Students Born per Month", y = "Number of Students", x = "Month")
p = p + theme(axis.text.x = element_text(angle = 60, hjust = 1))
p = p + scale_fill_manual(values = mycolors)
p
Show the five most frequent birth years, and the five least frequent birth year.
tot_byear <- survey %>%
group_by(Birth_Year) %>%
count(sort = TRUE)
head(tot_byear, n=5)
## # A tibble: 5 x 2
## # Groups: Birth_Year [5]
## Birth_Year n
## <chr> <int>
## 1 1999 23
## 2 1998 18
## 3 1996 15
## 4 1997 12
## 5 1995 9
tail(tot_byear, n=5)
## # A tibble: 5 x 2
## # Groups: Birth_Year [5]
## Birth_Year n
## <chr> <int>
## 1 1988 3
## 2 1990 3
## 3 1987 2
## 4 1975 1
## 5 1981 1
nb.cols <- 16
mycolors <- colorRampPalette(brewer.pal(8, "YlGnBu"))(nb.cols)
p1 = ggplot(data = survey, aes(x= Birth_Year, fill = Birth_Year))
p1 = p1 + geom_bar()
p1 = p1 + labs(title = "Number of Students Born per Year", y = "Number of Students", x = "Year")
p1 = p1 + theme(axis.text.x = element_text(angle = 60, hjust = 1))
p1 = p1 + scale_fill_manual(values = mycolors)
p1
The State column was also open ended, so there are many inconsistencies. Clean this column to eliminate any non-US state, and to standardize the values for State.
#Function to Make State Names in Full Form
unabrev <- function(data){
output <- state.name[grep(data, state.abb,ignore.case=T)]
return(output)
}
#Loop to Check and Fix State Abbreviations
for(i in 1:length(survey$State)){
survey$State[i] <- trimws(survey$State[i])
if(is.na(survey$State[i])){
survey$State[i] <- ""
}
if(survey$State[i] %in% state.abb){
survey$State[i] <- unabrev(survey$State[i])
}
}
#Fix State Misspellings
state_dic <- dictionary(lang = "en_US", affix = NULL, add_words = state.name,
cache = TRUE)
#Function to Extract State Name
unstate <- function(input){
output <- ""
for(n in 1:length(input)){
if(input[[1]][n] %in% state.name){
output <- input[[1]][n]
}
}
return(output)
}
#Loop to Check for Non State Names and Enter in the Function
for(y in 1:length(survey$State)){
if(survey$State[y] %in% state.name){
next
}else{
test <- hunspell_suggest(survey$State[y], dict=state_dic)
survey$State[y] <- unstate(test)
}
}
Show the five most frequent states, and the five least frequent state.
tot_state <- survey %>%
group_by(State) %>%
count(sort = TRUE)
tot_state <- tot_state[!apply(tot_state[,1] =="", 1, all),]
head(tot_state, n=5)
## # A tibble: 5 x 2
## # Groups: State [5]
## State n
## <chr> <int>
## 1 North Carolina 68
## 2 New Jersey 5
## 3 New York 4
## 4 California 3
## 5 Pennsylvania 3
tail(tot_state, n=5)
## # A tibble: 5 x 2
## # Groups: State [5]
## State n
## <chr> <int>
## 1 Minnesota 1
## 2 Missouri 1
## 3 Tennessee 1
## 4 Texas 1
## 5 Wisconsin 1
States
Similar to State, the column Country needs to be standardized. Once you standardize it, show the five most frequent countries, and the five least frequent countries.
usvar <- c("USA", "US", "U.S.", "U.S.A.", "United States of America")
for(i in 1:length(survey$Country)){
if(survey$Country[i] %in% usvar){
survey$Country[i] <- "United States"
}
survey$Country[i] <- trimws(survey$Country[i])
}
survey <- survey %>%
mutate(Country = ifelse(str_detect(Country,"Durham") == TRUE,"United States", Country))
tot_country <- survey %>%
group_by(Country) %>%
count(sort = TRUE)
head(tot_country, n=5)
## # A tibble: 5 x 2
## # Groups: Country [5]
## Country n
## <chr> <int>
## 1 United States 93
## 2 India 6
## 3 China 4
## 4 England 2
## 5 Russia 2
tail(tot_country, n=5)
## # A tibble: 5 x 2
## # Groups: Country [5]
## Country n
## <chr> <int>
## 1 Mexico 1
## 2 Nepal 1
## 3 Peru 1
## 4 Serbia 1
## 5 Spain 1
nb.cols <- 12
mycolors <- colorRampPalette(brewer.pal(11, "YlGnBu"))(nb.cols)
survey <- survey %>%
rowwise() %>%
mutate(Country = ifelse(str_detect(Country,"United States") == TRUE,NA, Country))
p2 = ggplot(data=subset(survey, !is.na(Country)), aes(x= Country, fill = Country)) + geom_bar() + labs(title = "Students and their Home Country not including US", y = "Number of Students", x = "Country")
p2 = p2 + theme(axis.text.x = element_text(angle = 60, hjust = 1))
p2 = p2 + scale_fill_manual(values = mycolors)
p2
We want to clean the How_Hear_About_Program column by eliminating the number and any extra character portion in those responses (so we keep only the text portion). Show the five most frequent responses to this column.
Please note that the written response “Friends” was added into the “Recommended by family/friends” category.
survey$How_Hear_About_Program <-str_remove_all(survey$How_Hear_About_Program, "[:digit:]. ")
survey <- survey %>%
mutate(How_Hear_About_Program = ifelse(str_detect(How_Hear_About_Program,"Friends") == TRUE,"Recommended by family/friends", How_Hear_About_Program))
hear_tot <- survey %>%
group_by(How_Hear_About_Program) %>%
count(sort = TRUE)
head(hear_tot, n=5)
## # A tibble: 5 x 2
## # Groups: How_Hear_About_Program [5]
## How_Hear_About_Program n
## <chr> <int>
## 1 Recommended by family/friends 53
## 2 Search engine 28
## 3 Recommended by professors 21
## 4 On social media (Twitter, Instagram, FB) 4
## 5 Recommendations found in blogs 2
graph1 <- survey %>%
arrange(How_Hear_About_Program)
nb.cols <- 13
mycolors <- colorRampPalette(brewer.pal(8, "YlGnBu"))(nb.cols)
p = ggplot(data = graph1, aes(x= How_Hear_About_Program, fill = How_Hear_About_Program))
p = p + geom_bar()
p = p + labs(title = "How Students Heard about the Program", y = "Number of Students", x = "Responses")
p = p + theme(axis.text.x = element_text(angle = 60, hjust = 1))
p = p + scale_fill_manual(values = mycolors) + coord_flip()
p
We want to transform the data from wide to long on column Languages by duplicating the values of the other columns. You can use the comma as a separator for Languages. Show the five most frequent languages.
survey <- survey %>%
mutate(Languages = ifelse(str_detect(Languages,"Ru") == TRUE,"Russian", Languages)) %>%
mutate(Languages = ifelse(str_detect(Languages,"Spanish") == TRUE,"Spanish", Languages))
languagecount <- survey %>%
separate(Languages, into= c("lang1","lang2", "lang3", "lang4"), sep=", ") %>%
gather(key= nothing, value = Languages, lang1:lang4, na.rm=T) %>%
select(-nothing) %>%
group_by(Languages) %>%
count(sort = TRUE)
tail(languagecount, n=5)
## # A tibble: 5 x 2
## # Groups: Languages [5]
## Languages n
## <chr> <int>
## 1 Nepali 1
## 2 Odia 1
## 3 Swahili 1
## 4 Urdu 1
## 5 Vietnamese 1
languagecount1 <- survey %>%
separate(Languages, into= c("lang1","lang2", "lang3", "lang4"), sep=", ") %>%
gather(key= nothing, value = Languages, lang1:lang4, na.rm=T) %>%
select(-nothing)
graph2 <- languagecount1 %>%
arrange(Languages)
nb.cols <- 19
mycolors <- colorRampPalette(brewer.pal(8, "YlGnBu"))(nb.cols)
p6 = ggplot(data = graph2, aes(x= Languages, fill = Languages))
p6 = p6 + geom_bar()
p6 = p6 + labs(title = "Languages Spoken by Students", y = "Number of Students", x = "Languages")
p6 = p6 + theme(axis.text.x = element_text(angle = 60, hjust = 1))
p6 = p6 + scale_fill_manual(values = mycolors) + coord_flip()
p6