# Load dplyr package
library(dplyr) #for use of dplyr functions such as glimpse(), mutate(), and filter()
library(ggplot2) #for use of ggplot2 functions such ggplot()
# Import data
comics <- read.csv("/resources/rstudio/Business Statistics/Data/comics.csv")
# Convert data to tbl_df
comics <- tbl_df(comics)
str(comics)
## Classes 'tbl_df', 'tbl' and 'data.frame': 23272 obs. of 11 variables:
## $ name : Factor w/ 23272 levels "'Spinner (Earth-616)",..: 19833 3335 22769 9647 20956 2220 17576 9346 18794 10957 ...
## $ id : Factor w/ 4 levels "No Dual","Public",..: 3 2 2 2 1 2 2 2 2 2 ...
## $ align : Factor w/ 4 levels "Bad","Good","Neutral",..: 2 2 3 2 2 2 2 2 3 2 ...
## $ eye : Factor w/ 26 levels "Amber Eyes","Auburn Hair",..: 11 5 5 5 5 5 6 6 6 5 ...
## $ hair : Factor w/ 28 levels "Auburn Hair",..: 7 27 3 3 4 14 7 7 7 4 ...
## $ gender : Factor w/ 3 levels "Female","Male",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ gsm : Factor w/ 6 levels "Bisexual Characters",..: NA NA NA NA NA NA NA NA NA NA ...
## $ alive : Factor w/ 2 levels "Deceased Characters",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ appearances : int 4043 3360 3061 2961 2258 2255 2072 2017 1955 1934 ...
## $ first_appear: Factor w/ 2328 levels "1-Apr","1-Aug",..: 1772 2074 2255 2089 2185 2192 2192 2139 2292 2192 ...
## $ publisher : Factor w/ 2 levels "dc","marvel": 2 2 2 2 2 2 2 2 2 2 ...
# Check the levels of gender
levels(comics$gender)
## [1] "Female" "Male" "Other"
levels(comics$hair)
## [1] "Auburn Hair" "Bald"
## [3] "Black Hair" "Blond Hair"
## [5] "Blue Hair" "Bronze Hair"
## [7] "Brown Hair" "Dyed Hair"
## [9] "Gold Hair" "Green Hair"
## [11] "Grey Hair" "Light Brown Hair"
## [13] "Magenta Hair" "No Hair"
## [15] "Orange Hair" "Orange-brown Hair"
## [17] "Pink Hair" "Platinum Blond Hair"
## [19] "Purple Hair" "Red Hair"
## [21] "Reddish Blond Hair" "Reddish Brown Hair"
## [23] "Silver Hair" "Strawberry Blond Hair"
## [25] "Variable Hair" "Violet Hair"
## [27] "White Hair" "Yellow Hair"
# Create a 2-way contingency table
tab <- table(comics$publisher, comics$gender)
tab <- table(comics$publisher, comics$hair)
# Print tab
tab
##
## Auburn Hair Bald Black Hair Blond Hair Blue Hair Bronze Hair
## dc 0 0 1574 744 41 0
## marvel 78 838 3755 1582 56 1
##
## Brown Hair Dyed Hair Gold Hair Green Hair Grey Hair
## dc 1148 0 5 42 157
## marvel 2339 1 8 117 531
##
## Light Brown Hair Magenta Hair No Hair Orange Hair
## dc 0 0 0 21
## marvel 6 5 1176 43
##
## Orange-brown Hair Pink Hair Platinum Blond Hair Purple Hair
## dc 0 11 2 32
## marvel 3 31 0 47
##
## Red Hair Reddish Blond Hair Reddish Brown Hair Silver Hair
## dc 461 0 3 3
## marvel 620 6 0 16
##
## Strawberry Blond Hair Variable Hair Violet Hair White Hair
## dc 28 0 4 346
## marvel 47 32 0 754
##
## Yellow Hair
## dc 0
## marvel 20
# Remove align level
comics <- comics %>%
filter(align != "Red-Haired") %>%
droplevels()
# Create plot of align
ggplot(comics, aes(x = gender)) +
geom_bar()
ggplot(comics, aes(x = hair)) +
geom_bar()
# Plot proportion of gender, conditional on align
ggplot(comics, aes(x = align, fill = gender)) +
geom_bar()
# Plot proportion of hair, conditional on align
ggplot(comics, aes(x = align, fill = hair)) +
geom_bar()
# Plot proportion of gender, conditional on align
ggplot(comics, aes(x = align, fill = gender)) +
geom_bar(position = "fill") #position = "fill", to have a stacked barchart
# Plot proportion of hair, conditional on align
ggplot(comics, aes(x = align, fill = hair)) +
geom_bar(position = "fill") #position = "fill", to have a stacked barchart
# Create side-by-side barchart of gender by alignment
ggplot(comics, aes(x = align, fill = gender)) +
geom_bar(position = "dodge") #position = "dodge", to have a side-by-side (i.e. not stacked) barchart.
# Create side-by-side barchart of hair by alignment
ggplot(comics, aes(x = align, fill = gender)) +
geom_bar(position = "dodge") #position = "dodge", to have a side-by-side (i.e. not stacked) barchart.
# Plot of alignment broken down by gender
ggplot(comics, aes(x = align)) +
geom_bar() +
facet_wrap(~ hair)# Plot of alignment broken down by hair
ggplot(comics, aes(x = align)) +
geom_bar() +
facet_wrap(~hair)
Interpretation
Approximately what proportion of all female characters are good? 51%. Look at how align was distributed within each gender
tab <- table(comics$align, comics$gender)
options(scipen = 999, digits = 3) # Print fewer digits
prop.table(tab) # Joint proportions
##
## Female Male Other
## Bad 0.0821968 0.3950985 0.0016722
## Good 0.1301144 0.2512933 0.0008883
## Neutral 0.0436850 0.0940064 0.0008883
## Reformed Criminals 0.0000523 0.0001045 0.0000000
prop.table(tab, 1) # Conditional on rows
##
## Female Male Other
## Bad 0.17161 0.82490 0.00349
## Good 0.34035 0.65733 0.00232
## Neutral 0.31523 0.67836 0.00641
## Reformed Criminals 0.33333 0.66667 0.00000
prop.table(tab, 2) # Conditional on columns
##
## Female Male Other
## Bad 0.321020 0.533554 0.484848
## Good 0.508163 0.339355 0.257576
## Neutral 0.170612 0.126949 0.257576
## Reformed Criminals 0.000204 0.000141 0.000000
Interpretation