# Widen console output so printed tables are not wrapped too early.
options(width = 100)

# Standard setup: select the project directory for the current machine
# (Windows vs. anything else, e.g. Linux) so that the relative file paths
# used later in this script resolve the same way on both.
setwd(
  if (Sys.info()[["sysname"]] == "Windows") {
    "~/Masters/DATA607/Project2"
  } else {
    "~/Documents/Masters/DATA607/Project2"
  }
)

# Dataset 1: FiveThirtyEight marriage data, read directly from GitHub.
# Strings are kept as character vectors rather than factors.
marriage_df <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/marriage/both_sexes.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
For this data set, I first gather the category columns into long format. I also invert the ratio (1 - ratio) so that the variable represents the share of people who have ever been married. Finally, I remove any rows that have no marriage ratio recorded.
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Reshape columns 4:75 (one column per category) into key/value pairs:
# `type` holds the former column name, `ratio` the value. The ratio is then
# inverted with 1 - ratio.
marriage_short_df <- marriage_df %>%
  gather("type", "ratio", 4:75) %>%
  mutate(ratio = 1 - ratio)

# Keep only the rows whose 5th column is not NA.
# NOTE(review): column 5 is `ratio` only if the source has exactly 75 columns
# (3 id columns + type + ratio after gathering) -- confirm against the CSV.
marriage_short_df <- marriage_short_df[!is.na(marriage_short_df[[5]]), ]
Now, I perform some analysis on the data by doing the following:
library(ggplot2)
# Replace `type` with the four-digit code found in it (e.g. "2534"), stored
# as `age_range`, then average the ratio per year and age range, rounded to
# two decimals.
marriage_summarized_df <- marriage_short_df %>%
  extract(type, "age_range", "_?(\\d{4})_?") %>%
  group_by(year, age_range) %>%
  arrange(year, age_range) %>%
  summarize(ratio = round(mean(ratio), 2))

marriage_summarized_df
## # A tibble: 51 x 3
## # Groups: year [?]
## year age_range ratio
## <int> <chr> <dbl>
## 1 1960 2534 0.81
## 2 1960 3544 0.92
## 3 1960 4554 0.92
## 4 1970 2534 0.82
## 5 1970 3544 0.92
## 6 1970 4554 0.93
## 7 1980 2534 0.77
## 8 1980 3544 0.92
## 9 1980 4554 0.94
## 10 1990 2534 0.69
## # ... with 41 more rows
# One line per age range: mean ratio over time.
ggplot(
  marriage_summarized_df,
  aes(x = year, y = ratio, group = age_range, color = age_range)
) +
  geom_line()
I then perform a second summary by age group and economic status, both of which are parsed from the category name using the extract function.
# Pull two fields out of `type`:
#   * age_range -- the four-digit code (`type` is kept for the next step);
#   * economic  -- one of poor / mid / rich (this call consumes `type`).
# `economic` becomes a factor ordered poor < mid < rich; rows whose `type`
# matched none of those values get NA here.
marriage_economic_df <- marriage_short_df %>%
  extract(type, "age_range", "_?(\\d{4})_?", remove = FALSE) %>%
  extract(type, "economic", "_?(poor|mid|rich)_?") %>%
  mutate(economic = factor(economic, levels = c("poor", "mid", "rich")))

# Drop every row that has an NA in any column (this removes the rows with no
# economic category).
marriage_economic_df <-
  marriage_economic_df[complete.cases(marriage_economic_df), ]

# Mean ratio (rounded to two decimals) per age range, year and economic group.
marriage_summarized_economic_df <- marriage_economic_df %>%
  group_by(age_range, year, economic) %>%
  arrange(age_range, year, economic) %>%
  summarize(ratio = round(mean(ratio), 2))

marriage_summarized_economic_df
## # A tibble: 153 x 4
## # Groups: age_range, year [?]
## age_range year economic ratio
## <chr> <int> <fctr> <dbl>
## 1 2534 1960 poor 0.79
## 2 2534 1960 mid 0.84
## 3 2534 1960 rich 0.77
## 4 2534 1970 poor 0.76
## 5 2534 1970 mid 0.85
## 6 2534 1970 rich 0.80
## 7 2534 1980 poor 0.68
## 8 2534 1980 mid 0.81
## 9 2534 1980 rich 0.81
## 10 2534 1990 poor 0.58
## # ... with 143 more rows
# One panel per age range, one line per economic group.
ggplot(
  marriage_summarized_economic_df,
  aes(x = year, y = ratio, group = economic, color = economic)
) +
  geom_line() +
  facet_wrap(~age_range)
Note: the data in the .csv file starts on the second line, and there are footnotes at the end, so we have to adjust where we start and stop reading.
library(stringr)
# Dataset 2: catfish farm counts.
# Read the raw file as text and keep only lines 2 through 8 (the header line
# plus the data rows); everything outside that range is skipped.
all_content <- readLines("./CatfishFarm.csv")
new_start <- all_content[2:8]

# Parse the retained lines as CSV. A textConnection is created already open,
# so read.csv() does not close it; close it explicitly (even if parsing fails)
# to avoid leaking the connection.
catfish_con <- textConnection(new_start)
catfish_df <- tryCatch(
  read.csv(catfish_con, header = TRUE, sep = ",", stringsAsFactors = FALSE),
  finally = close(catfish_con)
)

# Reduce each column name to its first run of digits; the first column has no
# usable name after that, so it is named "category" explicitly.
names(catfish_df) <- str_extract(names(catfish_df), "\\d+")
colnames(catfish_df)[1] <- "category"

# Keep only the leading alphabetic label (one or two words) of each category.
catfish_df$category <- str_extract(
  catfish_df$category,
  "[[:alpha:]]+\\s?[[:alpha:]]+"
)

# Strip thousands separators and convert columns 2:26 to numeric. lapply()
# works column by column, avoiding the data-frame -> character-matrix
# coercion that apply() performs.
catfish_df[2:26] <- lapply(catfish_df[2:26], function(x) {
  as.numeric(str_replace_all(as.character(x), ",", ""))
})
Gather the yearly data to long format.
# Columns 2:26 become key/value pairs: `year` (former column name) and `count`.
catfish_cleaned_df <- catfish_df %>%
  gather("year", "count", 2:26)

# Constant column: every row gets the same value.
catfish_cleaned_df[["cooking_style"]] <- "fried"
Create a summary for total catfish population, which is done by:
# Sum of `count` over all categories, one row per year.
catfish_cleaned_df %>%
  group_by(year) %>%
  arrange(year) %>%
  summarise(count = sum(count))
## # A tibble: 25 x 2
## year count
## <chr> <dbl>
## 1 1992 1729251
## 2 1993 1464106
## 3 1994 1386379
## 4 1995 1482191
## 5 1996 1680201
## 6 1997 1900419
## 7 1998 1832553
## 8 1999 1920771
## 9 2000 2128353
## 10 2001 2204600
## # ... with 15 more rows
# Plot the yearly totals as a single line. `group = 1` puts all points in
# one group so they are joined into a single line; the x labels are rotated
# 90 degrees.
catfish_cleaned_df %>%
  group_by(year) %>%
  arrange(year) %>%
  summarise(count = sum(count)) %>%
  ggplot(aes(x = year, y = count)) +
  geom_line(group = 1) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Create a summary for the percentage of fish per year by category:
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Total count per year and category, with `category` turned into a factor in
# the listed order, then spread to wide format: one row per year, one column
# per category.
catfish_cleaned2_df <- catfish_cleaned_df %>%
  mutate(
    category = factor(
      category,
      levels = c(
        "Broodfish", "Fingerling", "Stockers",
        "Small foodsize", "Medium foodsize", "Large foodsize"
      )
    )
  ) %>%
  group_by(year, category) %>%
  arrange(year, category) %>%
  summarize(count = sum(count)) %>%
  spread(category, count)

# Divide every column except the first by the row total of those columns, so
# each row becomes per-category shares.
catfish_cleaned2_df[-1] <-
  catfish_cleaned2_df[-1] / rowSums(catfish_cleaned2_df[-1])

# Back to long format for plotting: columns `year`, `variable`, `value`.
catfish_melt <- melt(catfish_cleaned2_df, id.vars = "year")
ggplot(data = catfish_melt, aes(x = year, y = value, group = variable,
color = variable)) + geom_line() + theme(axis.text.x = element_text(angle = 90,
hjust = 1))
As in dataset 2, we need to ignore some rows.
# Dataset 3: yoga searches by state and month.
# Read the raw file as text and drop its second line before parsing.
all_content <- readLines("./20160502_YogaByStateMonth.csv")
new_start <- all_content[-2]

# Parse the remaining lines as CSV. A textConnection is created already open,
# so read.csv() does not close it; close it explicitly (even if parsing fails)
# to avoid leaking the connection.
yoga_con <- textConnection(new_start)
yoga_df <- tryCatch(
  read.csv(yoga_con, header = TRUE, sep = ",", stringsAsFactors = FALSE),
  finally = close(yoga_con)
)

# From each column name keep the two-character code that sits between two
# dots (e.g. ".XX."), then strip the dots. Names without such a code become
# NA; the first column is therefore named "date" explicitly.
names(yoga_df) <- str_replace_all(
  str_extract(names(yoga_df), "\\.{1}\\w{2}\\.{1}\\B"),
  "\\.",
  ""
)
colnames(yoga_df)[1] <- "date"
Here I will clean the data by doing the following:
# Long format: columns 2:52 become `state` / `count` pairs. `date` is then
# split on "-" into `year` and `month`, both converted to numeric.
yoga_cleaned_df <- yoga_df %>%
  gather("state", "count", 2:52) %>%
  arrange(date, state) %>%
  separate(date, c("year", "month"), sep = "-") %>%
  mutate(
    year = as.numeric(year),
    month = as.numeric(month)
  )
To find the most popular month for yoga searches (or quarter, or year, etc.) in the last ten years, I subset the data to include only the last ten years. I then group by year and month and sum the counts. Once that is done, I can simply request the row with the maximum count.
# Restrict to the recent window, then total the counts per year and month.
# NOTE(review): this condition keeps rows from month 10 of 2008 onward (after
# 2007 with month > 9, or any month after 2008). Confirm that this is the
# intended window.
yoga_cleaned_10_df <- yoga_cleaned_df %>%
  filter((year > 2007 & month > 9) | year > 2008) %>%
  group_by(year, month) %>%
  summarise(count = sum(count))

# Row with the largest monthly total.
yoga_cleaned_10_df[which.max(yoga_cleaned_10_df$count), ]
## # A tibble: 1 x 3
## # Groups: year [1]
## year month count
## <dbl> <dbl> <int>
## 1 2016 1 1803
To try and answer the most questions pertaining to locational popularity in one graphic, I decided to display each state’s count value and apply a fill to show the respective year. In order to do this, the following actions were performed:
# Yearly total per state. `year` and `state` become factors; the state levels
# are reversed.
yoga_yr_df <- yoga_cleaned_df %>%
  arrange(year) %>%
  mutate(
    year = factor(year),
    state = factor(state),
    state = factor(state, levels = rev(levels(state)))
  ) %>%
  group_by(state, year) %>%
  summarise(count = sum(count))
# Horizontal stacked bars: one bar per state, filled by year.
ggplot(yoga_yr_df, aes(fill = year, x = state, y = count)) +
  geom_bar(stat = "identity") +
  scale_fill_discrete() +
  coord_flip()