# install (if needed) and load the required packages
pkgs <- c('dplyr', 'tidyr', 'DT', 'ggplot2', 'xlsx', 'reshape2')
for (p in pkgs) {
  if (!require(p, character.only = TRUE)) {
    install.packages(p)
    library(p, character.only = TRUE)
  }
}
Let’s revise the original questions, keeping in mind that these are estimates based on the 2010 Census data, and instead ask:
pop <- read.csv("PEP_2015_PEPANNRES_with_ann.csv")
pop_meta <- read.csv("PEP_2015_PEPANNRES_metadata.csv")
knitr::kable(pop_meta)
| GEO.id | Id |
|---|---|
| GEO.id2 | Id2 |
| GEO.display-label | Geography |
| rescen42010 | April 1, 2010 - Census |
| resbase42010 | April 1, 2010 - Estimates Base |
| respop72010 | Population Estimate (as of July 1) - 2010 |
| respop72011 | Population Estimate (as of July 1) - 2011 |
| respop72012 | Population Estimate (as of July 1) - 2012 |
| respop72013 | Population Estimate (as of July 1) - 2013 |
| respop72014 | Population Estimate (as of July 1) - 2014 |
| respop72015 | Population Estimate (as of July 1) - 2015 |
We learn that the Estimates Base is “the population count or estimate used as the starting point in the estimates process. It can be the last Census count or the estimate for a previous date. Also referred to as the ‘base population’.” If we compared the rescen42010 or resbase42010 variables to the 2015 estimates, they would not reflect the same point in the year (April vs. July). As a result, we conclude that we are only interested in the respop72010 population estimate variable.
We are also only interested in states (and DC + Puerto Rico), not the US as a whole or particular regions.
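The code below keeps those rows by position; as an alternative sketch (assuming the GEO.display.label values for the national and regional summary rows are spelled as shown here), they could be dropped by label instead:

```r
# hypothetical alternative to positional indexing: drop the national and
# regional summary rows by their geography labels
non_states <- c("United States", "Northeast Region", "Midwest Region",
                "South Region", "West Region")
pop_states <- pop %>% filter(!GEO.display.label %in% non_states)
```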
I have opted to leave the data in wide format, since it made it easier to calculate the summary statistic perdiff, which gives the percent difference between the 2015 and 2010 population estimates.
# keep the state-level rows (plus DC and Puerto Rico), select columns, and compute diff / perdiff
pop <- pop[6:57, ] %>%
select(GEO.display.label, respop72010, respop72015) %>%
mutate(diff=respop72015 - respop72010, perdiff=round((diff / respop72010) * 100, 3))
datatable(pop)
We can sort the data table on the summary statistic perdiff to find the state with the most estimated population growth (North Dakota, +12.215%) and the one with the largest estimated decline (Puerto Rico, -6.646%).
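The same extremes can also be pulled out programmatically rather than by sorting the widget; a minimal sketch:

```r
# state with the largest estimated growth (North Dakota) and
# the largest estimated decline (Puerto Rico)
pop %>% arrange(desc(perdiff)) %>% slice(1)
pop %>% arrange(perdiff) %>% slice(1)
```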
We can calculate the mean population difference, and then look for the states within a range of the mean.
m <- round(mean(pop$perdiff), 3)
m
[1] 3.443
pop[pop$perdiff > m - .1 & pop$perdiff < m + .1,] %>%
arrange(perdiff)
GEO.display.label respop72010 respop72015 diff perdiff
1 Minnesota 5310903 5489594 178691 3.365
2 Alaska 714021 738432 24411 3.419
3 Massachusetts 6565036 6794422 229386 3.494
Let’s see whether there is an association between alcohol consumption and gender. Once our data is prepared, we can use a chi-square test of independence to determine whether there is a relationship between these variables.
NOTE: The UCI ML Repository offers a handy .R file to create a single dataset that merges the Math and Portuguese classes. We used that file to create the .csv imported below.
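For context, that merge works roughly along the following lines (a sketch rather than the exact file; the join keys are the identifying student attributes, and the precise list may differ):

```r
# sketch of the merge: the same student appears once per class file, so the
# two files are joined on the attributes that identify a student
d1 <- read.table("student-mat.csv", sep = ";", header = TRUE)
d2 <- read.table("student-por.csv", sep = ";", header = TRUE)
merged <- merge(d1, d2,
                by = c("school", "sex", "age", "address", "famsize", "Pstatus",
                       "Medu", "Fedu", "Mjob", "Fjob", "reason", "nursery", "internet"))
# write.csv(merged, "studentalcohol.csv", row.names = FALSE)
```

Columns outside the join key, including Dalc and Walc, come out suffixed with .x and .y, one copy from each class, which is why the duplicates handled below exist.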
adat <- read.csv("studentalcohol.csv")
We are only interested in a few variables here: gender (sex) and the alcohol consumption ratings. Because the CSV was created by merging two data frames, one for the Math class and one for the Portuguese class, we have duplicate observations of alcohol consumption (the .x and .y columns).
alco <- adat %>%
select(sex, Dalc.x, Dalc.y, Walc.x, Walc.y)
table(alco$Dalc.x - alco$Dalc.y)
-1 0 1
3 377 2
From this table we can see that there are some slight differences in how students reported alcohol consumption between their two classes (or errors in the data). Just to be sure, let’s create variables that average the Dalc (workday) and Walc (weekend) values across the two classes. We will then combine those averages into a single summary statistic using the weighting introduced in the paper cited above:
\[Alc = \frac{Dalc \cdot 5 + Walc \cdot 2}{7}\]
alco <- alco %>%
mutate(Dalc = (Dalc.x+Dalc.y)/2, Walc = (Walc.x+Walc.y)/2) %>%
mutate(alc = round((Dalc * 5 + Walc * 2)/7)) %>%
select(sex, alc)
table(alco$alc, alco$sex)
F M
1 129 79
2 55 52
3 12 33
4 1 12
5 1 8
Let’s set up a chi-square test for independence between these two categorical variables.
\(H_0:\) Sex and Alc are independent
\(H_A:\) Sex and Alc are not independent
\(\alpha: .05\)
ggplot(alco,aes(x=alc))+geom_density()+facet_grid(~sex)
chisq.test(table(alco$alc, alco$sex))
Pearson's Chi-squared test
data: table(alco$alc, alco$sex)
X-squared = 36.191, df = 4, p-value = 2.643e-07
Our p-value is well below our \(\alpha\), so we can reject \(H_0\) and conclude that gender and alcohol consumption are not independent.
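One caveat: a few cells in the table are quite small, and the chi-square approximation relies on reasonably large expected counts (a common rule of thumb is at least 5 per cell). A quick way to check, as a sketch:

```r
# expected counts under independence; very small values would suggest
# collapsing the sparse alc levels or using fisher.test() instead
chisq.test(table(alco$alc, alco$sex))$expected
```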
Let’s look at a few variables from the LendingClub loan data (annual income, loan amount, interest rate, and loan grade) and try to come up with some good visualizations to plot their relationships.
ldat <- read.csv("LoanStats_2016Q2.csv")
loan_meta <- read.xlsx("LCDataDictionary.xlsx", 1)
loan_meta <- loan_meta[,1:2]
#knitr::kable(loan_meta)
#subset
loan <- ldat %>%
select(annual_inc, loan_amnt, int_rate, grade)
#remove NA obs
loan <- loan[complete.cases(loan), ]
# strip the '%' sign so int_rate becomes numeric
loan$int_rate <- as.numeric(unlist(strsplit(as.character(loan$int_rate), '%')))
# bin loan amounts
loan$loanbin <- findInterval(loan$loan_amnt, seq(0, 30000, 10000))
loan$loanbin[loan$loanbin==1] <- "<$10,000"
loan$loanbin[loan$loanbin==2] <- "<$20,000"
loan$loanbin[loan$loanbin==3] <- "<$30,000"
loan$loanbin[loan$loanbin==4] <- "<=$40,000"
loan$loanbin <- factor(loan$loanbin, levels=c("<$10,000", "<$20,000", "<$30,000", "<=$40,000"))
# bin annual income
loan$incomebin <- findInterval(loan$annual_inc, seq(0, 100000, 50000))
loan$incomebin[loan$incomebin==1] <- "<$50,000"
loan$incomebin[loan$incomebin==2] <- "<$100,000"
loan$incomebin[loan$incomebin==3] <- ">=$100,000"
#loan$incomebin <- as.factor(loan$incomebin)
loan$incomebin <- factor(loan$incomebin, levels=c("<$50,000", "<$100,000", ">=$100,000"))
# bin interest rates
loan$intbin <- findInterval(loan$int_rate, seq(5, 15, 3))
loan$intbin[loan$intbin==1] <- "5-8%"
loan$intbin[loan$intbin==2] <- "8-11%"
loan$intbin[loan$intbin==3] <- "11-14%"
loan$intbin[loan$intbin==4] <- "14-31%"
loan$intbin <- factor(loan$intbin, levels=c("5-8%", "8-11%", "11-14%", "14-31%"))
datatable(loan)
ggplot(loan, aes(x=loan_amnt)) + geom_histogram(aes(fill=..count..)) + facet_grid(.~grade) +
  labs(title = "Distribution of Loan Amounts by Grade", x="") + theme(axis.text.x = element_text(angle=90, hjust=1))
ggplot(loan, aes(x=factor(incomebin), y=loan_amnt, fill=factor(intbin))) + geom_boxplot() + facet_grid(. ~ grade) + labs(title = "Boxplot of Loan Amount by Income, Grade, and Interest Rate", x="Income", y="Loan Amount") +
  theme(axis.text.x = element_text(angle=90, hjust=1)) + scale_fill_discrete(name="Interest\nRate")
ggplot(loan, aes(x=factor(loanbin), y=int_rate)) + geom_violin(alpha=0.5, color="gray") + geom_jitter(alpha=0.1, aes(color=grade)) + facet_grid(. ~ incomebin) + labs(title = "Violin Plot of Interest Rate by Loan Amount, Income, and Grade", x="Loan Amount", y="Interest Rate") +
  theme(axis.text.x = element_text(angle=90, hjust=1)) + scale_color_discrete(name="Grade")
By binning our numeric variables into ordered categorical variables we are able to create visualizations that can quickly capture the relationships between the variables.
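As an aside, the same binning could be written more compactly with cut(), which returns an ordered factor directly; a sketch for the loan-amount bins (loanbin2 is just a hypothetical name, and the breaks and labels mirror the ones used above):

```r
# sketch: cut() builds the ordered factor in one step; right = FALSE keeps the
# same left-closed intervals as findInterval()
loan$loanbin2 <- cut(loan$loan_amnt,
                     breaks = c(0, 10000, 20000, 30000, Inf),
                     labels = c("<$10,000", "<$20,000", "<$30,000", "<=$40,000"),
                     right = FALSE, ordered_result = TRUE)
```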
Let us conclude by using the code from the handy guide linked below to create a correlation matrix of our numeric variables, using Pearson’s r and the cor function.
# correlation matrix of the numeric variables (Pearson's r, rounded to 2 digits)
thecor <- round(cor(loan[, c("annual_inc", "loan_amnt", "int_rate")], method = "pearson"), 2)
# keep only the upper triangle so the heatmap is not mirrored
get_upper_tri <- function(cormat){
  cormat[lower.tri(cormat)] <- NA
  return(cormat)
}
upper_tri <- get_upper_tri(thecor)
# reshape to long format for ggplot
melted_cormat <- melt(upper_tri, na.rm = TRUE)
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
coord_fixed() + geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
legend.justification = c(1, 0),
legend.position = c(0.6, 0.7),
legend.direction = "horizontal")+
guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
title.position = "top", title.hjust = 0.5))