# load datset
url <- "https://raw.githubusercontent.com/baroncurtin2/data607/master/project2/data/survey_data.csv"
survey <- read_csv(url, col_names = TRUE, col_types = NULL, na = c("", "NA")) %>%
tbl_df
## Parsed with column specification:
## cols(
## .default = col_integer()
## )
## See spec(...) for full column specifications.
# view data
kable(head(survey), caption = "Survey Data", format = "html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
survey %<>%
# gather all of the question columns
gather(key = "question", value = "response", -ptid) %>%
# rename ptid
rename(person_id = ptid) %>%
# remove Qs from question
mutate(question = str_replace(question, "Q", "")) %>%
# sort questions
arrange(question)
glimpse(survey)
## Observations: 520
## Variables: 3
## $ person_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ question <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1...
## $ response <int> 1, 5, 2, 3, 2, 2, 5, 3, 4, 2, 4, 1, 3, 2, 2, 1, 1, 2...
# group the data by question to get the mean for later analysis
grouped_survey <- survey %>%
group_by(question) %>%
summarise(meanQ = mean(response),
sd = sd(response),
abv_mean = sum(response > meanQ),
prop_abv_mean = abv_mean / n())
ggplot(survey, aes(x = question, y = response, color = question)) +
geom_boxplot() +
coord_flip() +
geom_hline(yintercept = mean(survey$response))
abv_mean <- survey %>%
filter(response > mean(response)) %>%
count
abv_mean/count(survey)
# analyse grouped data
grouped_survey %>%
filter(meanQ == max(meanQ))
grouped_survey %>%
filter(prop_abv_mean == max(prop_abv_mean))
# load datset
url <- "https://raw.githubusercontent.com/baroncurtin2/data607/master/project2/data/time_use.csv"
# rename headers
headers <- c("sex","country","total","personal","sleep","eating","other_personal","work_activities","work_travel","employ_activities","study","school_ex_hw","hw","free_study","household_care","food_mgmt","dish_wash","clean_house","house_upkeep","laundry","ironing","handicraft","gardening","tend_animals","pet_care","dog_walk","construction","shopping","childcare","teaching_child","house_mgmt","leisure_time","org_work","other_household_help","participation","feasts","other_social_life","culture","resting","walking_hiking","outdoor_activities","comp_games","computing","hobbies","reading_books","reading_other","tv_video","radio_music","leisure_other","leisure_travel","work_commute","travel_study","travel_shopping","child_chaperone","travel_other_household","travel_sociallife","travel_misc","misc")
# import into tibble
timeuse <- read_csv(url, col_names = headers, col_types = cols(.default = "c"), na = c("", "NA"), skip = 1) %>%
tbl_df()
# view data
kable(head(timeuse), caption = "Time Use Data", format = "html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
# helper function to get seconds
toMinutes <- function(x) {
hrs <- str_replace(x, '(\\d+):?(\\d+)', '\\1') %>%
as.numeric %>%
multiply_by(60)
mins <- str_replace(x, '(\\d+):?(\\d+)', '\\2') %>%
as.numeric
(hrs + mins)
}
# tidy operations
timeuse %<>%
# select * except total
select(-total) %>%
# mutate time columns to covert to minutes
mutate_at(.vars = vars(personal:misc), toMinutes) %>%
# gather all time columns
gather(key = "activity", value = "minutes", -(sex:country), na.rm = TRUE) %>%
# mutate germany in country column to say only germany
mutate(country = if_else(str_detect(country, '^(Germany).+'), 'Germany', country))
## Warning in function_list[[i]](value): NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
## Warning in function_list[[i]](value): NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
## Warning in function_list[[i]](value): NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
## Warning in function_list[[i]](value): NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
## Warning in function_list[[i]](value): NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
## Warning in function_list[[i]](value): NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
## Warning in function_list[[i]](value): NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
## Warning in function_list[[i]](value): NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
glimpse(timeuse)
## Observations: 1,522
## Variables: 4
## $ sex <chr> "Males", "Males", "Males", "Males", "Males", "Males",...
## $ country <chr> "Belgium", "Bulgaria", "Germany", "Estonia", "Spain",...
## $ activity <chr> "personal", "personal", "personal", "personal", "pers...
## $ minutes <dbl> 645, 714, 640, 635, 671, 704, 676, 646, 653, 644, 631...
sleep <- timeuse %>%
filter(activity == "sleep")
# sleep data by country
ggplot(sleep, aes(x = country, y = minutes)) +
geom_col(aes(fill = sex)) +
theme(axis.text.x = element_text(angle = 45, size = 8)) +
labs(title = 'Sleep Data')
# sleep data by sex
ggplot(sleep, aes(x = sex, y = minutes)) +
geom_col()+
labs(title = 'Sleep Data')
# calculate average sleep times for gender
sleep %>%
group_by(sex) %>%
summarise(avg = mean(minutes))
# get total sleep times
sleep %>%
group_by(country) %>%
summarise(total = sum(minutes)) %>%
arrange(desc(total)) %>%
head(5)
# load datset
url <- "https://raw.githubusercontent.com/baroncurtin2/data607/master/project2/data/population.csv"
census <- read_csv(url, col_names = TRUE, col_types = NULL, na = c("", "NA")) %>%
tbl_df
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_character()
## )
## See spec(...) for full column specifications.
# view data
kable(head(census), caption = "Population Data", format = "html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
exclude_list <- c("World", "Asia & Oceania", "Africa", "Europe", "Central & South America", "North America", "Eurasia", "Middle East")
census %<>%
# mutate X1 -> country
rename(country = X1) %>%
# gather all of the year columns
gather(key = "year", value = "population", -country) %>%
# filter aggregate countries
filter(!(country %in% exclude_list)) %>%
# replace NA and -- with 0
mutate(population = if_else(is.na(population) | (population == "--"), 0, as.double(population))) %>%
# sort by country name
arrange(country)
## Warning in replace_with(out, !condition, false, fmt_args(~false),
## glue("length of {fmt_args(~condition)}")): NAs introduced by coercion
glimpse(census)
## Observations: 6,944
## Variables: 3
## $ country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afgha...
## $ year <chr> "1980", "1981", "1982", "1983", "1984", "1985", "19...
## $ population <dbl> 15.04360, 13.67368, 12.57743, 12.43058, 12.75384, 1...
# subset data to only contain last 10 years
last10years <- census %>%
filter(year <= 2010 & year >= 2001) %>%
# group data
group_by(country) %>%
# calculate growth rate
mutate(growth_rate = (population/lag(population, 1)) - 1)
# top 5 countries by population in 2010
top5countries <- census %>%
filter(year == 2010) %>%
arrange(desc(population)) %>%
head(5)
# inner join to only get the top 5 countries' data
top5last10years <- inner_join(last10years, top5countries, by = "country") %>%
select(1:4) %>%
rename(year = year.x,
population = population.x)
# create lineplot of growth rates
ggplot(top5last10years, aes(x = year, y = growth_rate, color = country, group = country)) +
geom_line() +
scale_y_continuous(labels = scales::percent)
## Warning: Removed 5 rows containing missing values (geom_path).