Abstract
Exercício relacionado à disciplina de Tópicos Especiais do programa de Pós-Graduação em Estatística e Experimentação Agropecuária (UFLA). Professor Paulo Henrique. Inicialmente vamos importar o dataset Super Heróis do Kaggle. Faça o download dos dados no site Kaggle e importe-os do diretório onde se encontram, como no exemplo abaixo. Utilizaremos inicialmente três pacotes: readr, dplyr e ggplot2. Portanto, utilize a função library() para deixá-los disponíveis na sessão do R ou, caso necessário, instale-os antes com install.packages().
# Import the hero information table; "", "-" and "NA" are read as missing values.
hero_info <- read_csv(
  file = "/home/zarzar/Documentos/UFLA Doutorado/Disciplina/Data Science/Exercícios/Exercício_5/Dados/heroes_information.csv",
  na = c("", "-", "NA")
)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## name = col_character(),
## Gender = col_character(),
## `Eye color` = col_character(),
## Race = col_character(),
## `Hair color` = col_character(),
## Height = col_double(),
## Publisher = col_character(),
## `Skin color` = col_character(),
## Alignment = col_character(),
## Weight = col_double()
## )
# Inspect the column types and the first values of the imported table.
# (The report repeated this exact call and its output twice; once is enough.)
glimpse(hero_info)
## Observations: 734
## Variables: 11
## $ X1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
## $ name <chr> "A-Bomb", "Abe Sapien", "Abin Sur", "Abomination", …
## $ Gender <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Ma…
## $ `Eye color` <chr> "yellow", "blue", "blue", "green", "blue", "blue", …
## $ Race <chr> "Human", "Icthyo Sapien", "Ungaran", "Human / Radia…
## $ `Hair color` <chr> "No Hair", "No Hair", "No Hair", "No Hair", "Black"…
## $ Height <dbl> 203, 191, 185, 203, -99, 193, -99, 185, 173, 178, 1…
## $ Publisher <chr> "Marvel Comics", "Dark Horse Comics", "DC Comics", …
## $ `Skin color` <chr> NA, "blue", "red", NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Alignment <chr> "good", "good", "good", "bad", "bad", "bad", "good"…
## $ Weight <dbl> 441, 65, 90, 441, -99, 122, -99, 88, 61, 81, 104, 1…
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
colnames(hero_info) # Variable names
## [1] "X1" "name" "Gender" "Eye color" "Race"
## [6] "Hair color" "Height" "Publisher" "Skin color" "Alignment"
## [11] "Weight"
# Standardise the variable names with janitor (e.g. "Eye color" becomes eye_color).
hero_info <- hero_info %>% clean_names()
# Preview only the heroes published by Marvel Comics or DC Comics.
filter(hero_info, publisher %in% c("Marvel Comics", "DC Comics"))
## # A tibble: 603 x 11
## x1 name gender eye_color race hair_color height publisher
## <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 0 A-Bo… Male yellow Human No Hair 203 Marvel C…
## 2 2 Abin… Male blue Unga… No Hair 185 DC Comics
## 3 3 Abom… Male green Huma… No Hair 203 Marvel C…
## 4 4 Abra… Male blue Cosm… Black -99 Marvel C…
## 5 5 Abso… Male blue Human No Hair 193 Marvel C…
## 6 7 Adam… Male blue Human Blond 185 DC Comics
## 7 8 Agen… Female blue <NA> Blond 173 Marvel C…
## 8 9 Agen… Male brown Human Brown 178 Marvel C…
## 9 10 Agen… Male <NA> <NA> <NA> 191 Marvel C…
## 10 11 Air-… Male blue <NA> White 188 Marvel C…
## # … with 593 more rows, and 3 more variables: skin_color <chr>,
## # alignment <chr>, weight <dbl>
# Derive a short publisher label: "Marvel", "DC", or "Outros" for any other
# known publisher. Rows whose publisher is missing stay NA.
hero_info <- hero_info %>%
  mutate(publisher_new = case_when(
    publisher == "Marvel Comics" ~ "Marvel",
    publisher == "DC Comics" ~ "DC",
    # Marvel and DC were already claimed by the branches above, so every
    # remaining non-missing publisher is "Outros". The previous condition,
    # publisher != c("Marvel Comics", "DC Comics"), recycled the length-2
    # vector against alternating rows and was only correct by accident.
    !is.na(publisher) ~ "Outros"
  ))
# Number of heroes per publisher label.
hero_info %>% count(publisher_new)
## # A tibble: 4 x 2
## publisher_new n
## <chr> <int>
## 1 DC 215
## 2 Marvel 388
## 3 Outros 116
## 4 <NA> 15
# Keep only the Marvel Comics and DC Comics heroes for the remaining analyses.
df <- hero_info %>% filter(publisher %in% c("Marvel Comics", "DC Comics"))
# Heaviest recorded weight; na.rm guards against missing weights.
max(df$weight, na.rm = TRUE)
## [1] 900
# Look up the heaviest hero through the computed maximum rather than a
# hard-coded 900, so the query stays correct if the data change.
df %>%
  filter(weight == max(weight, na.rm = TRUE)) %>%
  select(name, weight, publisher_new)
## # A tibble: 1 x 3
## name weight publisher_new
## <chr> <dbl> <chr>
## 1 Sasquatch 900 Marvel
# Heroes weighing more than 400, ordered by publisher and then by weight.
pesado <- df %>%
  filter(weight > 400) %>%
  select(name, weight, publisher_new) %>%
  arrange(publisher_new, weight)
# Freeze the sorted order as factor levels so the bars are drawn in that order.
pesado <- pesado %>%
  mutate(
    name = factor(name, levels = name),
    publisher_new = factor(publisher_new)
  )
# Bar chart of weight per hero, filled by publisher, with tilted x-axis labels.
ggplot(data = pesado, mapping = aes(x = name, y = weight, fill = publisher_new)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 45))
Portanto, esse gráfico de barras, construído com o ggplot2, indica quais são os 14 super-heróis mais fortes/pesados (peso acima de 400) entre a Marvel e a DC.
O gráfico abaixo mostra os 10 super-heróis com maior variedade de poderes.
# Import the hero powers table; "", "-" and "NA" are read as missing values.
hero_powers <- read_csv(
  file = "/home/zarzar/Documentos/UFLA Doutorado/Disciplina/Data Science/Exercícios/Exercício_5/Dados/super_hero_powers.csv",
  na = c("", "-", "NA")
)
## Parsed with column specification:
## cols(
## .default = col_logical(),
## hero_names = col_character()
## )
## See spec(...) for full column specifications.
# Restrict the powers table to the heroes whose name appears in df.
power <- filter(hero_powers, hero_names %in% df$name)
# Count how many elements of `vetor` compare equal to TRUE.
# NA entries are not counted. Returns a single integer.
n_poder <- function(vetor) {
  sum(vetor == TRUE, na.rm = TRUE)
}
# Number of powers per hero. Only the logical power columns are summed:
# apply() over the rows of a data frame first coerces every column (hero
# names included) to a character matrix, so the count relied on comparing
# the string "TRUE". rowSums() on the logical columns counts TRUE directly.
power$poderes <- as.integer(
  rowSums(power[vapply(power, is.logical, logical(1))], na.rm = TRUE)
)
# Keep the heroes with more than 30 powers and plot their power counts.
aa <- power %>% filter(poderes > 30)
ggplot(aa, aes(x = hero_names, y = poderes)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(x = "Super-heróis", y = "Variedade de poderes")
O gráfico a seguir mostra os 15 poderes mais comuns dos super-heróis.
# Number of heroes having each power. Only the logical power columns are
# counted; apply() over the whole data frame coerced it to a character matrix
# and also "counted" the hero_names and poderes columns.
a <- vapply(
  power[vapply(power, is.logical, logical(1))],
  sum,
  integer(1),
  na.rm = TRUE
)
a <- sort(a, decreasing = TRUE)
# head() returns at most 15 entries and never pads with NA when fewer exist.
a <- head(a, 15)
names(a)
## [1] "Super Strength" "Stamina" "Durability"
## [4] "Super Speed" "Flight" "Agility"
## [7] "Accelerated Healing" "Reflexes" "Energy Blasts"
## [10] "Intelligence" "Invulnerability" "Longevity"
## [13] "Stealth" "Marksmanship" "Telepathy"
# Two-column table: power name and number of heroes having it.
# unname() keeps the power names out of the row names.
a <- data.frame(poderes = names(a), n = unname(a))
a
## poderes n
## 1 Super Strength 302
## 2 Stamina 228
## 3 Durability 215
## 4 Super Speed 207
## 5 Flight 191
## 6 Agility 189
## 7 Accelerated Healing 145
## 8 Reflexes 135
## 9 Energy Blasts 126
## 10 Intelligence 120
## 11 Invulnerability 95
## 12 Longevity 90
## 13 Stealth 87
## 14 Marksmanship 84
## 15 Telepathy 75
# Pie chart: one stacked bar of the counts, wrapped into polar coordinates.
ggplot(data = a, mapping = aes(x = "", y = n, fill = poderes)) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0)
Gráfico da proporção de gênero dos super-heróis em cada universo de histórias em quadrinhos.
# Plot: number of heroes of each gender per publisher, bars side by side.
library(ggplot2)
ggplot(data = df, mapping = aes(x = publisher_new, fill = gender)) +
  geom_bar(position = "dodge", color = "lightblue") +
  labs(x = "Universo de desenho em quadrinhos", fill = "sexo")
Após fazer o cruzamento dos bancos de dados (hero_powers e hero_info), separamos os super-heróis inteligentes e plotamos o gráfico segregando por sexo e pelos universos das histórias em quadrinhos para analisar os dados.
# Keep only the hero name and the Intelligence flag, for heroes present in df.
aa <- power %>%
  filter(hero_names %in% df$name) %>%
  select(hero_names, Intelligence)
aa %>% glimpse()
## Observations: 521
## Variables: 2
## $ hero_names <chr> "A-Bomb", "Abin Sur", "Abomination", "Abraxas", "Ab…
## $ Intelligence <lgl> FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE…
df %>% glimpse()
## Observations: 603
## Variables: 12
## $ x1 <dbl> 0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 16, 19, 20…
## $ name <chr> "A-Bomb", "Abin Sur", "Abomination", "Abraxas", "A…
## $ gender <chr> "Male", "Male", "Male", "Male", "Male", "Male", "F…
## $ eye_color <chr> "yellow", "blue", "green", "blue", "blue", "blue",…
## $ race <chr> "Human", "Ungaran", "Human / Radiation", "Cosmic E…
## $ hair_color <chr> "No Hair", "No Hair", "No Hair", "Black", "No Hair…
## $ height <dbl> 203, 185, 203, -99, 193, 185, 173, 178, 191, 188, …
## $ publisher <chr> "Marvel Comics", "DC Comics", "Marvel Comics", "Ma…
## $ skin_color <chr> NA, "red", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ alignment <chr> "good", "good", "bad", "bad", "bad", "good", "good…
## $ weight <dbl> 441, 90, 441, -99, 122, 88, 61, 81, 104, 108, 90, …
## $ publisher_new <chr> "Marvel", "DC", "Marvel", "Marvel", "Marvel", "DC"…
# Attach the hero attributes from df, matched on the hero name, and keep only
# the heroes flagged as intelligent.
# NOTE(review): if a name occurs more than once in df, the join repeats that
# hero's row and inflates the counts — confirm names are unique.
aa <- aa %>%
  mutate(name = hero_names) %>%
  inner_join(df, by = "name") %>%
  filter(Intelligence)
library(ggplot2)
# Number of intelligent heroes of each gender per publisher, bars side by side.
ggplot(data = aa, mapping = aes(x = publisher_new, fill = gender)) +
  geom_bar(position = "dodge", color = "lightblue") +
  labs(x = "Universos de desenho em quadrinhos \n super-heróis inteligenstes", fill = "sexo")
# Scatter plot of height against weight on the unfiltered Marvel/DC data.
ggplot(data = df, mapping = aes(x = weight, y = height)) +
  geom_point()
# Keep only rows whose weight and height both lie between 1 and 500
# (this drops, for example, the -99 placeholder values), then plot again.
herois_limpo <- df %>%
  filter(weight >= 1, weight <= 500, height >= 1, height <= 500)
ggplot(data = herois_limpo, mapping = aes(x = weight, y = height)) +
  geom_point()