Grando Project 2

# Allow console output to run up to 100 characters wide.
options(width = 100)

# Point the working directory at the project folder. The location differs
# between the Windows and Linux machines this script is run on, so the path
# is picked from the reported operating system name.
setwd(
  if (Sys.info()[["sysname"]] == "Windows") {
    "~/Masters/DATA607/Project2"
  } else {
    "~/Documents/Masters/DATA607/Project2"
  }
)

Dataset 1 - Marriage rates

# Load the marriage data straight from the FiveThirtyEight GitHub repository.
# Strings are kept as character vectors rather than converted to factors.
marriage_df <- read.csv(
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/marriage/both_sexes.csv",
  sep = ",",
  header = TRUE,
  stringsAsFactors = FALSE
)

For this data I will first gather the different category columns to convert the information into long format. I will also invert the ratio value (1 - ratio) so that the variable represents the share of people who have ever been married. I will then remove any rows which have no marriage ratio recorded.

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Reshape columns 4 through 75 into long format: the former column name goes
# into `type` and its value into `ratio`, which is then flipped to 1 - ratio.
marriage_short_df <- marriage_df %>%
  gather("type", "ratio", 4:75) %>%
  mutate(ratio = 1 - ratio)

# Keep only rows whose fifth column is not missing.
# NOTE(review): column 5 is expected to be `ratio` after the gather above.
marriage_short_df <- marriage_short_df[!is.na(marriage_short_df[, 5]), ]

Now, I perform some analysis on the data by doing the following:

  1. Create a column for age groups by using the extract function
  2. Group by year and age group
  3. Arrange by year and age group
  4. Summarize each age group by taking the mean of the ratio
  5. Plot the data
library(ggplot2)
# Replace `type` with the first four-digit run found in it (stored as
# `age_range`), then average the ratio per year and age range, rounded to
# two decimals.
marriage_summarized_df <- marriage_short_df %>%
  extract(type, "age_range", "_?(\\d{4})_?") %>%
  arrange(year, age_range) %>%
  group_by(year, age_range) %>%
  summarise(ratio = round(mean(ratio), 2))
marriage_summarized_df
## # A tibble: 51 x 3
## # Groups:   year [?]
##     year age_range ratio
##    <int>     <chr> <dbl>
##  1  1960      2534  0.81
##  2  1960      3544  0.92
##  3  1960      4554  0.92
##  4  1970      2534  0.82
##  5  1970      3544  0.92
##  6  1970      4554  0.93
##  7  1980      2534  0.77
##  8  1980      3544  0.92
##  9  1980      4554  0.94
## 10  1990      2534  0.69
## # ... with 41 more rows
# One line per age range showing the mean ratio over the years.
ggplot(
  marriage_summarized_df,
  aes(x = year, y = ratio, color = age_range, group = age_range)
) +
  geom_line()

I then perform a second summary using age groups and economic status, both taken using the extract function.

# Derive two columns from `type`: the four-digit age range (keeping `type`
# for the second pass) and the economic label, which then replaces `type`.
# The label becomes a factor ordered poor < mid < rich for display.
marriage_economic_df <- marriage_short_df %>%
  extract(type, "age_range", "_?(\\d{4})_?", remove = FALSE) %>%
  extract(type, "economic", "_?(poor|mid|rich)_?") %>%
  mutate(economic = factor(economic, levels = c("poor", "mid", "rich")))

# Discard every row that has a missing value in any column; this removes the
# rows whose `type` carried no poor/mid/rich label.
marriage_economic_df <- marriage_economic_df[
  complete.cases(marriage_economic_df),
]

# Mean ratio (two decimals) per age range, year and economic label.
marriage_summarized_economic_df <- marriage_economic_df %>%
  arrange(age_range, year, economic) %>%
  group_by(age_range, year, economic) %>%
  summarise(ratio = round(mean(ratio), 2))
marriage_summarized_economic_df
## # A tibble: 153 x 4
## # Groups:   age_range, year [?]
##    age_range  year economic ratio
##        <chr> <int>   <fctr> <dbl>
##  1      2534  1960     poor  0.79
##  2      2534  1960      mid  0.84
##  3      2534  1960     rich  0.77
##  4      2534  1970     poor  0.76
##  5      2534  1970      mid  0.85
##  6      2534  1970     rich  0.80
##  7      2534  1980     poor  0.68
##  8      2534  1980      mid  0.81
##  9      2534  1980     rich  0.81
## 10      2534  1990     poor  0.58
## # ... with 143 more rows
# One panel per age range, one line per economic label.
ggplot(
  marriage_summarized_economic_df,
  aes(x = year, y = ratio, color = economic, group = economic)
) +
  geom_line() +
  facet_wrap(~age_range)

Dataset 2. - Catfish

Note: the .csv file's header starts on the second line and there are footnotes at the end, so we have to adjust where to start and stop reading.

library(stringr)

# Read the raw file and keep only lines 2-8: the header row plus the data
# rows, skipping the line above the header and the footnotes below the data.
all_content <- readLines("./CatfishFarm.csv")
new_start <- all_content[2:8]

# Passing the lines through `text =` lets read.csv create and close its own
# text connection, instead of leaving an open textConnection behind.
catfish_df <- read.csv(
  text = new_start,
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

# Reduce each column name to the digits it contains; the first column has
# none (it becomes NA) and is named explicitly on the next line.
names(catfish_df) <- str_extract(names(catfish_df), "\\d+")
colnames(catfish_df)[1] <- "category"

# Keep only the leading one- or two-word alphabetic label of each category.
catfish_df$category <- str_extract(
  catfish_df$category,
  "[[:alpha:]]+\\s?[[:alpha:]]+"
)

# Strip thousands separators and convert every non-category column to numeric.
# lapply() works column by column, so columns that read.csv already parsed as
# numbers are not pushed through as.matrix()/format() the way apply() on a
# data frame would, and no column count is hard-coded.
catfish_df[-1] <- lapply(catfish_df[-1], function(x) {
  as.numeric(str_replace_all(x, ",", ""))
})

Gather the yearly data to long format.

# Columns 2 through 26 become rows: the column name goes to `year` and the
# value to `count`.
catfish_cleaned_df <- catfish_df %>%
  gather("year", "count", 2:26)
# Constant column recycled across every row.
catfish_cleaned_df[["cooking_style"]] <- "fried"

Create a summary for total catfish population, which is done by:

  1. Group and arrange the data by year.
  2. Summarize the data based on the count values.
# Total count across all categories for each year.
catfish_cleaned_df %>%
  arrange(year) %>%
  group_by(year) %>%
  summarize(count = sum(count))
## # A tibble: 25 x 2
##     year   count
##    <chr>   <dbl>
##  1  1992 1729251
##  2  1993 1464106
##  3  1994 1386379
##  4  1995 1482191
##  5  1996 1680201
##  6  1997 1900419
##  7  1998 1832553
##  8  1999 1920771
##  9  2000 2128353
## 10  2001 2204600
## # ... with 15 more rows
# Plot the yearly totals as a single line. `year` is a character column, so
# `group = 1` is needed to connect the points; the x labels are rotated to
# keep them readable.
ggplot(
  catfish_cleaned_df %>%
    arrange(year) %>%
    group_by(year) %>%
    summarize(count = sum(count)),
  aes(x = year, y = count)
) +
  geom_line(group = 1) +
  theme(axis.text.x = element_text(hjust = 1, angle = 90))

Create a summary for the percentage of fish per year by category:

  1. Set levels of the factor to be consistent with fish sizes
  2. Group and arrange by year and category.
  3. Summarize based on sum of count.
  4. Spread the categories back out to be columns of the data for each year.
  5. Convert the counts to proportions.
  6. Melt the data into a ggplot-friendly format.
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Order the categories explicitly, total the counts per year and category,
# then spread the categories back out into one column each.
catfish_cleaned2_df <- catfish_cleaned_df %>%
  mutate(
    category = factor(
      category,
      levels = c(
        "Broodfish", "Fingerling", "Stockers",
        "Small foodsize", "Medium foodsize", "Large foodsize"
      )
    )
  ) %>%
  arrange(year, category) %>%
  group_by(year, category) %>%
  summarise(count = sum(count)) %>%
  spread(category, count)

# Turn each row's counts into proportions of that row's total; the first
# column (year) is left untouched.
catfish_cleaned2_df[-1] <- catfish_cleaned2_df[-1] /
  rowSums(catfish_cleaned2_df[-1])

# Back to long format (columns: year, variable, value) for plotting.
catfish_melt <- melt(catfish_cleaned2_df, id.vars = "year")

# One line per category showing its share per year, with rotated x labels.
ggplot(
  data = catfish_melt,
  aes(x = year, y = value, color = variable, group = variable)
) +
  geom_line() +
  theme(axis.text.x = element_text(hjust = 1, angle = 90))

Dataset 3 - Yoga Searches by state

As in dataset 2, we need to ignore some rows.

# Read the raw file and drop its second line so that the first line can be
# used as the header row.
all_content <- readLines("./20160502_YogaByStateMonth.csv")
new_start <- all_content[-2]

# Passing the lines through `text =` lets read.csv create and close its own
# text connection, instead of leaving an open textConnection behind.
yoga_df <- read.csv(
  text = new_start,
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

# Reduce each column name to the two-character code found between dots
# (presumably the state abbreviation) and strip the dots. The first column
# is named explicitly on the next line.
names(yoga_df) <- str_replace_all(
  str_extract(names(yoga_df), "\\.{1}\\w{2}\\.{1}\\B"),
  "\\.",
  ""
)
colnames(yoga_df)[1] <- "date"

Here I will clean the data by doing the following:

  1. Gather the states and make them a single variable column called “state”.
  2. Arrange data by date and state
  3. Separate the date into year and month.
  4. Mutate year and month so that they are numeric.
# Long format with one row per date and state: columns 2 through 52 are
# gathered into `state`/`count`, rows are sorted, and `date` is split on "-"
# into numeric `year` and `month` columns.
yoga_cleaned_df <- yoga_df %>%
  gather("state", "count", 2:52) %>%
  arrange(date, state) %>%
  separate(date, c("year", "month"), sep = "-") %>%
  mutate(
    year = as.numeric(year),
    month = as.numeric(month)
  )

To find the most popular month of yoga searches (or quarter, or year, etc.) in the last ten years, I will subset the data to include only the last ten years. Then I will group by year and month and summarize the count. After that is done, I can simply request that the row with the maximum count value be returned.

# Restrict to the recent window, then total the counts per year and month.
# NOTE(review): this condition keeps rows from month 10 of 2008 onward;
# confirm that this is the intended "last ten years" window.
yoga_cleaned_10_df <- yoga_cleaned_df %>%
  filter(year > 2008 | (year > 2007 & month > 9)) %>%
  group_by(year, month) %>%
  summarize(count = sum(count))

# Show the year/month row with the largest total count.
yoga_cleaned_10_df[which.max(yoga_cleaned_10_df[["count"]]), ]
## # A tibble: 1 x 3
## # Groups:   year [1]
##    year month count
##   <dbl> <dbl> <int>
## 1  2016     1  1803

To try and answer the most questions pertaining to locational popularity in one graphic, I decided to display each state’s count value and apply a fill to show the respective year. In order to do this, the following actions were performed:

  1. Create factors for year and state.
  2. Reverse the state factor order for display reasons.
  3. Group data by state and year.
  4. Summarize based on the sum of the counts
# Total count per state and year. Both keys become factors; the state levels
# are reversed so that, after the axes are flipped, the states appear in
# their original level order from top to bottom.
yoga_yr_df <- yoga_cleaned_df %>%
  arrange(year) %>%
  mutate(
    year = factor(year),
    state = factor(state),
    state = factor(state, levels = rev(levels(state)))
  ) %>%
  group_by(state, year) %>%
  summarize(count = sum(count))

# Horizontal stacked bars: one bar per state, filled by year.
ggplot(yoga_yr_df, aes(x = state, y = count, fill = year)) +
  geom_bar(stat = "identity") +
  scale_fill_discrete() +
  coord_flip()