In this lesson students will learn to apply categorical data analysis methods to data sets with fundamentally different structures.
The tidyverse
package is needed for these examples
library(tidyverse)
#install.packages("titanic")
library(titanic)
data("Titanic")
str(Titanic)
## 'table' num [1:4, 1:2, 1:2, 1:2] 0 0 35 0 0 0 17 0 118 154 ...
## - attr(*, "dimnames")=List of 4
## ..$ Class : chr [1:4] "1st" "2nd" "3rd" "Crew"
## ..$ Sex : chr [1:2] "Male" "Female"
## ..$ Age : chr [1:2] "Child" "Adult"
## ..$ Survived: chr [1:2] "No" "Yes"
If we want to be able to use all our helpful dplyr
verbs, we need to reformat this table as a data.frame
.
Titanic<-as.data.frame(Titanic)
Create a one-way frequency table for the distribution of class.
## frequency table for class
## can you think of how you would do this in dplyr?
titanClass<-Titanic%>%
group_by(Class)%>%
summarise(n_class=sum(Freq))
titanClass
## # A tibble: 4 × 2
## Class n_class
## <fct> <dbl>
## 1 1st 325
## 2 2nd 285
## 3 3rd 706
## 4 Crew 885
We might also want to display proportions.
## What could we do if we want proportions?
titanClassProp<-titanClass%>%
mutate(prop=n_class/sum(n_class))
titanClassProp
## # A tibble: 4 × 3
## Class n_class prop
## <fct> <dbl> <dbl>
## 1 1st 325 0.148
## 2 2nd 285 0.129
## 3 3rd 706 0.321
## 4 Crew 885 0.402
Let’s visualize this distribution.
Since the data are already cross-tabulated we must specify the height
of the bars with y=Freq
and
stat = "identity"
.
## bar graphs
ggplot(Titanic, aes(x=Class, y=Freq))+
geom_bar(stat = "identity")
Since bars are two dimensional the color
aesthetic only
outlines bars.
What is going on in this graph?
## add color
ggplot(Titanic, aes(x=Class, y=Freq, color=Class))+
geom_bar(stat = "identity")
## oops! let's use fill
ggplot(Titanic, aes(x=Class, y=Freq, fill=Class))+
geom_bar(stat = "identity")
If we want the height of the bar to be a non-integer value (such as
proportions) we can use geom_col
.
## change y-axis
ggplot(titanClassProp,
aes(x=Class, y=prop, fill=Class))+
geom_col()
Make a stacked bar graph.
## Pie chart
## 1) Start with a stacked bar
ggplot(Titanic, aes(x=1, y=Freq, fill=Class))+
geom_bar(stat = "identity")
Use polar coordinates
## 2) plot it in a circle
ggplot(Titanic, aes(x=1, y=Freq, fill=Class))+
geom_bar(stat = "identity")+
coord_polar("y", start=0)+
theme_void()
## Joint distributions
titan2Way<-Titanic%>%
group_by(Class, Survived)%>%
summarise(tot=sum(Freq))
## `summarise()` has grouped output by 'Class'. You can override using the
## `.groups` argument.
titan2Way
## # A tibble: 8 × 3
## # Groups: Class [4]
## Class Survived tot
## <fct> <fct> <dbl>
## 1 1st No 122
## 2 1st Yes 203
## 3 2nd No 167
## 4 2nd Yes 118
## 5 3rd No 528
## 6 3rd Yes 178
## 7 Crew No 673
## 8 Crew Yes 212
## Now Joint
titanJoint<-titan2Way%>%
mutate(joint=tot/2201)
titanJoint
## # A tibble: 8 × 4
## # Groups: Class [4]
## Class Survived tot joint
## <fct> <fct> <dbl> <dbl>
## 1 1st No 122 0.0554
## 2 1st Yes 203 0.0922
## 3 2nd No 167 0.0759
## 4 2nd Yes 118 0.0536
## 5 3rd No 528 0.240
## 6 3rd Yes 178 0.0809
## 7 Crew No 673 0.306
## 8 Crew Yes 212 0.0963
sum(titanJoint$joint)
## [1] 1
## Marginal Class
titanMargClass<-titanJoint%>%
group_by(Class)%>%
summarise(marginal=sum(joint))
titanMargClass
## # A tibble: 4 × 2
## Class marginal
## <fct> <dbl>
## 1 1st 0.148
## 2 2nd 0.129
## 3 3rd 0.321
## 4 Crew 0.402
## Marginal Survival
titanMargSurv<-titanJoint%>%
group_by(Survived)%>%
summarise(marginal=sum(joint))
titanMargSurv
## # A tibble: 2 × 2
## Survived marginal
## <fct> <dbl>
## 1 No 0.677
## 2 Yes 0.323
## Conditional on Class
titanCondClass<-titan2Way%>%
left_join(titanClass)%>%
mutate(cond=tot/n_class)
## Joining, by = "Class"
titanCondClass
## # A tibble: 8 × 5
## # Groups: Class [4]
## Class Survived tot n_class cond
## <fct> <fct> <dbl> <dbl> <dbl>
## 1 1st No 122 325 0.375
## 2 1st Yes 203 325 0.625
## 3 2nd No 167 285 0.586
## 4 2nd Yes 118 285 0.414
## 5 3rd No 528 706 0.748
## 6 3rd Yes 178 706 0.252
## 7 Crew No 673 885 0.760
## 8 Crew Yes 212 885 0.240
## stacked
ggplot(Titanic, aes(x=Class, y=Freq, fill=Survived))+
geom_bar(stat = "identity")+
theme_bw()
## side-by-side
ggplot(Titanic, aes(x=Class, y=Freq, fill=Survived))+
geom_bar(stat = "identity",
position="dodge")+
theme_bw()
## filled
ggplot(Titanic, aes(x=Class, y=Freq, fill=Survived))+
geom_bar(stat = "identity",
position="fill")+
theme_bw()
It is cognitively very difficult to compare pies!
## comparing pies
ggplot(Titanic, aes(x=1, y=Freq, fill=Class))+
geom_bar(stat = "identity", position = "fill")+
facet_grid(.~Survived)+
coord_polar("y", start=0)+
theme_void()
## USING FACET_GRID
ggplot(Titanic, aes(x=1, y=Freq, fill=Survived))+
geom_bar(stat = "identity", position = "fill")+
facet_grid(.~Class)+
coord_polar("y", start=0)+
theme_void()
## USING FACET_WRAP
ggplot(Titanic, aes(x=1, y=Freq, fill=Survived))+
geom_bar(stat = "identity", position = "fill")+
facet_wrap(.~Class, nrow=2, ncol=2)+
coord_polar("y", start=0)+
theme_void()
Nine-hundred and ten (910) randomly sampled registered voters from Tampa, FL were asked if they thought workers who have illegally entered the US should be (i) allowed to keep their jobs and apply for US citizenship, (ii) allowed to keep their jobs as temporary guest workers but not allowed to apply for US citizenship, or (iii) lose their jobs and have to leave the country. The results of the survey by political ideology are shown below.
From Questions from Introduction to Modern Statistics.
#install.packages("openintro")
library(openintro)
data("immigration")
str(immigration)
## tibble [910 × 2] (S3: tbl_df/tbl/data.frame)
## $ response : Factor w/ 4 levels "Apply for citizenship",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ political: Factor w/ 3 levels "conservative",..: 1 1 1 1 1 1 1 1 1 1 ...
By default R will order a variable alphabetically, but we might not want that.
immigration$political<-as.character(immigration$political)
immigration$political<-factor(immigration$political,
levels = c("conservative","moderate", "liberal"))
In this dataset the rows represent individuals. In the following
example we will learn how to use the table
and
prop.table
functions.
What percent of these Tampa, FL voters identify themselves as conservatives?
# Table for Political affiliation
# use table() function
tabPol<-table(immigration$political)
tabPol
##
## conservative moderate liberal
## 372 363 175
# the prop.table() function must take a table object
prop.table(tabPol)
##
## conservative moderate liberal
## 0.4087912 0.3989011 0.1923077
# create a graph to display the distribution
ggplot(immigration, aes(x=political, fill=political))+
geom_bar()
We can also use kable
to make tables in R markdown:
library(knitr)
kable(tabPol, col.names = c('Party', 'Count'),
caption = "Distribution of Political Indentities")
Party | Count |
---|---|
conservative | 372 |
moderate | 363 |
liberal | 175 |
What percent of Tampa, FL voters are in favor of the citizenship option?
# Table for citizenship response
# use table() function
tabResp<-table(immigration$response)
tabResp
##
## Apply for citizenship Guest worker Leave the country
## 278 262 350
## Not sure
## 20
# kable
kable(tabResp, col.names = c('Response', 'Count'),
caption = "Distribution of Response to Citizenship")
Response | Count |
---|---|
Apply for citizenship | 278 |
Guest worker | 262 |
Leave the country | 350 |
Not sure | 20 |
# use prop.table()
prop.table(tabResp)
##
## Apply for citizenship Guest worker Leave the country
## 0.30549451 0.28791209 0.38461538
## Not sure
## 0.02197802
# create a graph to display the distribution
ggplot(immigration, aes(x=response, fill=response))+
geom_bar()
# stacked bar graph
ggplot(immigration, aes(x=1, fill=response))+
geom_bar()
# pie graph
ggplot(immigration, aes(x=1, fill=response))+
geom_bar()+
coord_polar("y", start=0)+
theme_void()
What percent of these Tampa, FL voters identify themselves as conservatives and are in favor of the citizenship option?
## conservative and citizen
# Row then col
tabPolResp<-table(immigration$political, immigration$response)
tabPolResp
##
## Apply for citizenship Guest worker Leave the country Not sure
## conservative 57 121 179 15
## moderate 120 113 126 4
## liberal 101 28 45 1
## kable
kable(tabPolResp)
Apply for citizenship | Guest worker | Leave the country | Not sure | |
---|---|---|---|---|
conservative | 57 | 121 | 179 | 15 |
moderate | 120 | 113 | 126 | 4 |
liberal | 101 | 28 | 45 | 1 |
## joint
prop.table(tabPolResp)
##
## Apply for citizenship Guest worker Leave the country Not sure
## conservative 0.062637363 0.132967033 0.196703297 0.016483516
## moderate 0.131868132 0.124175824 0.138461538 0.004395604
## liberal 0.110989011 0.030769231 0.049450549 0.001098901
sum(prop.table(tabPolResp))
## [1] 1
## kable
kable(round(prop.table(tabPolResp),2))
Apply for citizenship | Guest worker | Leave the country | Not sure | |
---|---|---|---|---|
conservative | 0.06 | 0.13 | 0.20 | 0.02 |
moderate | 0.13 | 0.12 | 0.14 | 0.00 |
liberal | 0.11 | 0.03 | 0.05 | 0.00 |
What percent of these Tampa, FL voters who identify themselves as conservatives are also in favor of the citizenship option? What percent of moderates share this view? What percent of liberals share this view?
## marginal prop
prop.table(tabPolResp, 1)
##
## Apply for citizenship Guest worker Leave the country Not sure
## conservative 0.153225806 0.325268817 0.481182796 0.040322581
## moderate 0.330578512 0.311294766 0.347107438 0.011019284
## liberal 0.577142857 0.160000000 0.257142857 0.005714286
ggplot(immigration, aes(x=political, fill=response))+
geom_bar(position="fill")
In 1973, UC Berkeley became “one of the first universities to be sued for sexual discrimination” (with a statistically significant difference)
## UC Berk
data(UCBAdmissions)
str(UCBAdmissions)
## 'table' num [1:2, 1:2, 1:6] 512 313 89 19 353 207 17 8 120 205 ...
## - attr(*, "dimnames")=List of 3
## ..$ Admit : chr [1:2] "Admitted" "Rejected"
## ..$ Gender: chr [1:2] "Male" "Female"
## ..$ Dept : chr [1:6] "A" "B" "C" "D" ...
cal<-as.data.frame(UCBAdmissions)
ggplot(cal, aes(x=Gender, y= Freq, fill=Admit))+
geom_bar(stat = "identity",
position="fill")
ggplot(cal, aes(x=Gender, y= Freq, fill=Admit))+
geom_bar(stat = "identity",
position="fill")+
facet_grid(.~Dept)
How does this happen?
“The simple explanation is that women tended to apply to the departments that are the hardest to get into, and men tended to apply to departments that were easier to get into. (Humanities departments tended to have less research funding to support graduate students, while science and engineer departments were awash with money.) So women were rejected more than men. Presumably, the bias wasn’t at Berkeley but earlier in women’s education, when other biases led them to different fields of study than men.”