Datasets

Survey Data

Objectives:
- Tidy datset
- Make the data more narrow, i.e. convert each question to a row
- Analyze the distributions via boxplot

# load datset
url <- "https://raw.githubusercontent.com/baroncurtin2/data607/master/project2/data/survey_data.csv"

survey <- read_csv(url, col_names = TRUE, col_types = NULL, na = c("", "NA")) %>%
  tbl_df

## Parsed with column specification:
## cols(
##   .default = col_integer()
## )

## See spec(...) for full column specifications.

# view data
kable(head(survey), caption = "Survey Data", format = "html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Tidy our Survey Data

survey %<>%
  # gather all of the question columns
  gather(key = "question", value = "response", -ptid) %>%
  # rename ptid
  rename(person_id = ptid) %>%
  # remove Qs from question
  mutate(question = str_replace(question, "Q", "")) %>%
  # sort questions
  arrange(question)

glimpse(survey)

## Observations: 520
## Variables: 3
## $ person_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ question  <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1...
## $ response  <int> 1, 5, 2, 3, 2, 2, 5, 3, 4, 2, 4, 1, 3, 2, 2, 1, 1, 2...

# group the data by question to get the mean for later analysis
grouped_survey <- survey %>%
  group_by(question) %>%
  summarise(meanQ = mean(response), 
            sd = sd(response), 
            abv_mean = sum(response > meanQ),
            prop_abv_mean = abv_mean / n())

Visualize Survey Data

ggplot(survey, aes(x = question, y = response, color = question)) +
  geom_boxplot() +
  coord_flip() +
  geom_hline(yintercept = mean(survey$response))

Conclusions

abv_mean <- survey %>%
  filter(response > mean(response)) %>%
  count

abv_mean/count(survey)

# analyse grouped data
grouped_survey %>%
  filter(meanQ == max(meanQ))

grouped_survey %>%
  filter(prop_abv_mean == max(prop_abv_mean))

51.9% of all responses are above the mean of the typical response across all questions
Question 8b had the highest mean response value at 3.45
Questions 2 and 6e had the highest proportion of responses above their respective means

Time Use Data

Objectives:
- Tidy datset
- Make the data more narrow, i.e. convert each activity to a row
- Analyze how males vs females spend their time
- Analyze how each countries males and females compare to each other

# load datset
url <- "https://raw.githubusercontent.com/baroncurtin2/data607/master/project2/data/time_use.csv"
# rename headers
headers <- c("sex","country","total","personal","sleep","eating","other_personal","work_activities","work_travel","employ_activities","study","school_ex_hw","hw","free_study","household_care","food_mgmt","dish_wash","clean_house","house_upkeep","laundry","ironing","handicraft","gardening","tend_animals","pet_care","dog_walk","construction","shopping","childcare","teaching_child","house_mgmt","leisure_time","org_work","other_household_help","participation","feasts","other_social_life","culture","resting","walking_hiking","outdoor_activities","comp_games","computing","hobbies","reading_books","reading_other","tv_video","radio_music","leisure_other","leisure_travel","work_commute","travel_study","travel_shopping","child_chaperone","travel_other_household","travel_sociallife","travel_misc","misc")

# import into tibble
timeuse <- read_csv(url, col_names = headers, col_types = cols(.default = "c"), na = c("", "NA"), skip = 1) %>%
  tbl_df()

# view data
kable(head(timeuse), caption = "Time Use Data", format = "html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Tidy Timeuse Data

# helper function to get seconds
toMinutes <- function(x) {
  hrs <- str_replace(x, '(\\d+):?(\\d+)', '\\1') %>%
    as.numeric %>%
    multiply_by(60)
  mins <- str_replace(x, '(\\d+):?(\\d+)', '\\2') %>%
    as.numeric
  
  (hrs + mins)
}

# tidy operations
timeuse %<>%
  # select * except total
  select(-total) %>%
  # mutate time columns to covert to minutes
  mutate_at(.vars = vars(personal:misc), toMinutes) %>%
  # gather all time columns
  gather(key = "activity", value = "minutes", -(sex:country), na.rm = TRUE) %>%
  # mutate germany in country column to say only germany
  mutate(country = if_else(str_detect(country, '^(Germany).+'), 'Germany', country))

## Warning in function_list[[i]](value): NAs introduced by coercion

## Warning in function_list[[k]](value): NAs introduced by coercion

## Warning in function_list[[i]](value): NAs introduced by coercion

## Warning in function_list[[k]](value): NAs introduced by coercion

## Warning in function_list[[i]](value): NAs introduced by coercion

## Warning in function_list[[k]](value): NAs introduced by coercion

## Warning in function_list[[i]](value): NAs introduced by coercion

## Warning in function_list[[k]](value): NAs introduced by coercion

## Warning in function_list[[i]](value): NAs introduced by coercion

## Warning in function_list[[k]](value): NAs introduced by coercion

## Warning in function_list[[i]](value): NAs introduced by coercion

## Warning in function_list[[k]](value): NAs introduced by coercion

## Warning in function_list[[i]](value): NAs introduced by coercion

## Warning in function_list[[k]](value): NAs introduced by coercion

## Warning in function_list[[i]](value): NAs introduced by coercion

## Warning in function_list[[k]](value): NAs introduced by coercion

glimpse(timeuse)

## Observations: 1,522
## Variables: 4
## $ sex      <chr> "Males", "Males", "Males", "Males", "Males", "Males",...
## $ country  <chr> "Belgium", "Bulgaria", "Germany", "Estonia", "Spain",...
## $ activity <chr> "personal", "personal", "personal", "personal", "pers...
## $ minutes  <dbl> 645, 714, 640, 635, 671, 704, 676, 646, 653, 644, 631...

Visualize Time Use Data

sleep <- timeuse %>%
  filter(activity == "sleep")
  
# sleep data by country
ggplot(sleep, aes(x = country, y = minutes)) +
  geom_col(aes(fill = sex)) +
  theme(axis.text.x = element_text(angle = 45, size = 8)) + 
  labs(title = 'Sleep Data')

# sleep data by sex
ggplot(sleep, aes(x = sex, y = minutes)) +
  geom_col()+ 
  labs(title = 'Sleep Data')

Conclusions

# calculate average sleep times for gender
sleep %>%
  group_by(sex) %>%
  summarise(avg = mean(minutes))

# get total sleep times
sleep %>%
  group_by(country) %>%
  summarise(total = sum(minutes)) %>%
  arrange(desc(total)) %>%
  head(5)

On average, women sleep more than men
The top 5 countries that sleep the most are Bulgaria, France, Latvia, Spain, and Lithuania

Population Data

Objectives:
- Tidy datset
- Make the data more narrow, i.e. convert each year to a row
- Three columns: Country, Year, Population
- Analyze growth rate of the past 10 years via line graph

# load datset
url <- "https://raw.githubusercontent.com/baroncurtin2/data607/master/project2/data/population.csv"

census <- read_csv(url, col_names = TRUE, col_types = NULL, na = c("", "NA")) %>%
  tbl_df

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification:
## cols(
##   .default = col_character()
## )

## See spec(...) for full column specifications.

# view data
kable(head(census), caption = "Population Data", format = "html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Tidy Population Data

exclude_list <- c("World", "Asia & Oceania", "Africa", "Europe", "Central & South America", "North America", "Eurasia", "Middle East")

census %<>%
  # mutate X1 -> country
  rename(country = X1) %>%
  # gather all of the year columns
  gather(key = "year", value = "population", -country) %>%
  # filter aggregate countries
  filter(!(country %in% exclude_list)) %>%
  # replace NA and -- with 0
  mutate(population = if_else(is.na(population) | (population == "--"), 0, as.double(population))) %>%
  # sort by country name
  arrange(country)

## Warning in replace_with(out, !condition, false, fmt_args(~false),
## glue("length of {fmt_args(~condition)}")): NAs introduced by coercion

glimpse(census)

## Observations: 6,944
## Variables: 3
## $ country    <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afgha...
## $ year       <chr> "1980", "1981", "1982", "1983", "1984", "1985", "19...
## $ population <dbl> 15.04360, 13.67368, 12.57743, 12.43058, 12.75384, 1...

Visualize Population Data

# subset data to only contain last 10 years
last10years <- census %>%
  filter(year <= 2010 & year >= 2001) %>%
  # group data
  group_by(country) %>%
  # calculate growth rate
  mutate(growth_rate = (population/lag(population, 1)) - 1)

# top 5 countries by population in 2010
top5countries <- census %>%
  filter(year == 2010) %>%
  arrange(desc(population)) %>%
  head(5)

# inner join to only get the top 5 countries' data
top5last10years <- inner_join(last10years, top5countries, by = "country") %>%
  select(1:4) %>%
  rename(year = year.x,
         population = population.x)

# create lineplot of growth rates
ggplot(top5last10years, aes(x = year, y = growth_rate, color = country, group = country)) +
  geom_line() +
  scale_y_continuous(labels = scales::percent)

## Warning: Removed 5 rows containing missing values (geom_path).

Conclusions

Of the top 5 countries in population, all of the countries except for the US were trending downwards in growth over the course of 10 years
India experienced the most consistently highest growth rate over the course of the 10 year span
China experienced the lowest growth rate consistently by a wide margin over the 10 year span

Data 607 - Project2

Baron Curtin

March 8, 2018

Datasets

Survey Data

Tidy our Survey Data

Visualize Survey Data

Conclusions

Time Use Data

Tidy Timeuse Data

Visualize Time Use Data

Conclusions

Population Data

Tidy Population Data

Visualize Population Data

Conclusions