library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(RColorBrewer)
library(readr)
# Load the hero attribute table from the CSV in the working directory.
heroes_information <- readr::read_csv(file = "heroes_information.csv")
## New names:
## * `` -> ...1
## Rows: 734 Columns: 11
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (8): name, Gender, Eye color, Race, Hair color, Publisher, Skin color, A...
## dbl (3): ...1, Height, Weight
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style viewer only in an interactive session; when the
# script is knitted or run in batch mode there is no viewer to show it in.
if (interactive()) {
  View(heroes_information)
}
1. Examine the data set with str, dim, etc. What do you notice about the data? Are there any potential problems?
# Compact overview of every column: type, length and first few values.
heroes_information %>% str()
## spec_tbl_df [734 x 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ...1 : num [1:734] 0 1 2 3 4 5 6 7 8 9 ...
## $ name : chr [1:734] "A-Bomb" "Abe Sapien" "Abin Sur" "Abomination" ...
## $ Gender : chr [1:734] "Male" "Male" "Male" "Male" ...
## $ Eye color : chr [1:734] "yellow" "blue" "blue" "green" ...
## $ Race : chr [1:734] "Human" "Icthyo Sapien" "Ungaran" "Human / Radiation" ...
## $ Hair color: chr [1:734] "No Hair" "No Hair" "No Hair" "No Hair" ...
## $ Height : num [1:734] 203 191 185 203 -99 193 -99 185 173 178 ...
## $ Publisher : chr [1:734] "Marvel Comics" "Dark Horse Comics" "DC Comics" "Marvel Comics" ...
## $ Skin color: chr [1:734] "-" "blue" "red" "-" ...
## $ Alignment : chr [1:734] "good" "good" "good" "bad" ...
## $ Weight : num [1:734] 441 65 90 441 -99 122 -99 88 61 81 ...
## - attr(*, "spec")=
## .. cols(
## .. ...1 = col_double(),
## .. name = col_character(),
## .. Gender = col_character(),
## .. `Eye color` = col_character(),
## .. Race = col_character(),
## .. `Hair color` = col_character(),
## .. Height = col_double(),
## .. Publisher = col_character(),
## .. `Skin color` = col_character(),
## .. Alignment = col_character(),
## .. Weight = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Number of rows and columns in the raw table.
heroes_information %>% dim()
## [1] 734 11
# Keep only the rows whose Race is exactly "Human".
filter(heroes_information, Race == "Human")
## # A tibble: 208 x 11
## ...1 name Gender `Eye color` Race `Hair color` Height Publisher
## <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 0 A-Bomb Male yellow Human No Hair 203 Marvel C~
## 2 5 Absorbing Man Male blue Human No Hair 193 Marvel C~
## 3 7 Adam Strange Male blue Human Blond 185 DC Comics
## 4 9 Agent Bob Male brown Human Brown 178 Marvel C~
## 5 14 Alex Mercer Male - Human - -99 Wildstorm
## 6 16 Alfred Pennywor~ Male blue Human Black 178 DC Comics
## 7 20 Ammo Male brown Human Black 188 Marvel C~
## 8 27 Animal Man Male blue Human Blond 183 DC Comics
## 9 29 Ant-Man Male blue Human Blond 211 Marvel C~
## 10 30 Ant-Man II Male blue Human Blond 183 Marvel C~
## # ... with 198 more rows, and 3 more variables: `Skin color` <chr>,
## # Alignment <chr>, Weight <dbl>
### 208 superheroes have Race equal to "Human" (mixed entries such as "Human / Radiation" are not counted).
3. Find the frequency of Gender values. How many female superheroes are there?
# Frequency of each Gender value, including the "-" placeholder.
table(heroes_information[["Gender"]])
##
## - Female Male
## 29 200 505
# There are 200 female superheroes (29 records have Gender recorded as "-").
4. Create a contingency table for Gender and Alignment. What do you notice about this analysis?
# Cross-tabulate Gender (rows) against Alignment (columns).
table(
  heroes_information[["Gender"]],
  heroes_information[["Alignment"]]
)
##
## - bad good neutral
## - 1 7 19 2
## Female 0 35 161 4
## Male 6 165 316 18
5. Filter the data to remove rows where Height is less than 0. Do the same for Weight. Use this filtered data for questions 6 through 10 below.
# Filtered working table: keep only rows where both Height and Weight are
# strictly positive (the raw data uses negative values such as -99).
HI_DF <- filter(heroes_information, Height > 0, Weight > 0)
6. Calculate the mean, median and standard deviation of superhero Weight and Height.
# Centre and spread of Height in the filtered data.
mean(HI_DF[["Height"]])
## [1] 187.1239
median(HI_DF[["Height"]])
## [1] 183
sd(HI_DF[["Height"]])
## [1] 58.99002
# Centre and spread of Weight in the filtered data.
mean(HI_DF[["Weight"]])
## [1] 112.1796
median(HI_DF[["Weight"]])
## [1] 81
sd(HI_DF[["Weight"]])
## [1] 104.4227
7. Create a histogram of Weight. Describe the distribution and what it tells us.
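The distribution is right-skewed (positively skewed): most weights sit at the low end, with a long tail of very heavy superheroes, which is why the mean (112.2) is well above the median (81).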
# Histogram of hero weights in the filtered data.
# Weight is mapped by its bare column name so ggplot2 looks it up in the
# `data` argument; writing `HI_DF$Weight` inside aes() bypasses that lookup
# and breaks as soon as the plot data is filtered, faceted or swapped.
# The bin count is deliberately left at the stat_bin() default.
ggplot(HI_DF, aes(x = Weight)) +
  geom_histogram() +
  labs(x = "Weight", y = "Count") +
  theme_minimal(base_size = 10)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
8. Plot Height vs. Weight. Do you see any correlation? Describe what you see.
As height increases, so does weight. In this data set I noticed that the superheroes who stand around 180-250 cm all weighed around 69-90.
# Scatter plot of Height against Weight, one point per hero.
# No geom_line() layer: it would join observations in order of Height and
# draw a zig-zag between unrelated heroes instead of showing a relationship.
p1H <- HI_DF %>%
  ggplot(aes(x = Height, y = Weight)) +
  geom_point() +
  labs(x = "Height", y = "Weight") +
  theme_minimal(base_size = 9)
p1H
9. Calculate the five-number summary values for Height and Weight. Create a box plot of Weight for Alignment. What does this chart tell us?
# summary() reports min, quartiles, median and max, plus the mean.
Height5Num <- HI_DF %>% pull(Height) %>% summary()
Weight5Num <- HI_DF %>% pull(Weight) %>% summary()
print(Height5Num)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.2 173.5 183.0 187.1 188.0 975.0
print(Weight5Num)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.0 61.0 81.0 112.2 106.0 900.0
Boxplot Weight for Alignment
# Box plot of Weight split by Alignment, using the filtered data only.
# The formula interface groups Weight by the Alignment labels directly, so the
# labels do not need converting (as.numeric() on them only produces NA).
# Both variables come from HI_DF so every weight is paired with the alignment
# from the same row.
boxplot(
  Weight ~ Alignment,
  data = HI_DF,
  xlab = "Alignment",
  ylab = "Weight"
)
10. Filter the data to include only the two largest publishers. Which publisher has a higher proportion of bad aligned superheroes? Hint: One way to do this is to use dplyr and group by the two columns Publisher and Alignment rather than a single column.
Marvel Comics
# Names of the two publishers with the most heroes in the filtered data.
top_publishers <- HI_DF %>%
  filter(!is.na(Publisher)) %>%
  count(Publisher, sort = TRUE) %>%
  slice_head(n = 2) %>%
  pull(Publisher)

# For those two publishers, count heroes per alignment and convert the counts
# to within-publisher proportions, then show only the "bad" row of each so the
# two shares can be compared directly.
HI_DF %>%
  filter(Publisher %in% top_publishers) %>%
  count(Publisher, Alignment, name = "n_heroes") %>%
  group_by(Publisher) %>%
  mutate(proportion = n_heroes / sum(n_heroes)) %>%
  ungroup() %>%
  filter(Alignment == "bad")