knitr::opts_chunk$set(warning = FALSE)
knitr::include_graphics("HP.gif")
## You can find the downloadable dataset at this link: Harry
Potter Dataset
library(devtools)
library(gtsummary)
library(ggplot2)
library(magick)
library(gmodels)
library(plyr)
library(dplyr)
library(DT)
library(tidyverse)
library(rmarkdown)
library(dplyr)
library(magrittr)
library(htmltools)
library(vembedr)
library(ggpubr)
library(ggplot2)
library(tibble)
library(rstatix)
library(formattable)
library(data.table)
library(table1)
library(factoextra)
HarryPotter <- read.csv("/cloud/project/HPExport.csv", header=TRUE, stringsAsFactors=FALSE)
head(HarryPotter ,5)
## Name Sex House Race
## 1 (Bill) William Arthur Weasley Male Gryffindor Half-Human
## 2 Aberforth Dumbledore Male <NA> Human
## 3 Alastor Moody Male <NA> Human
## 4 Albus Percival Wulfric Brian Dumbledore Male Gryffindor Human
## 5 Alecto Carrow Female Slytherin Human
## Blood.status Birthdate Deathdate.or.Censor Days Age Loyalty
## 1 Pure-blood 11/29/1970 6/30/1998 10,075.00 27.00 Dumbledore
## 2 Half-blood 9/1/1884 6/30/1996 40,844.00 111.00 Dumbledore
## 3 Pure-blood 1/1/1959 7/27/1997 14,087.00 38.00 Dumbledore
## 4 Half-blood 8/31/1981 6/30/1997 5,782.00 15.83 Dumbledore
## 5 Unknown 12/31/1980 6/30/1998 6,390.00 17.49 Voldemort
## Status
## 1 1
## 2 1
## 3 2
## 4 2
## 5 1
str(HarryPotter)
## 'data.frame': 127 obs. of 11 variables:
## $ Name : chr "(Bill) William Arthur Weasley" "Aberforth Dumbledore" "Alastor Moody" "Albus Percival Wulfric Brian Dumbledore" ...
## $ Sex : chr "Male" "Male" "Male" "Male" ...
## $ House : chr "Gryffindor" NA NA "Gryffindor" ...
## $ Race : chr "Half-Human" "Human" "Human" "Human" ...
## $ Blood.status : chr "Pure-blood" "Half-blood" "Pure-blood" "Half-blood" ...
## $ Birthdate : chr "11/29/1970" "9/1/1884" "1/1/1959" "8/31/1981" ...
## $ Deathdate.or.Censor: chr "6/30/1998" "6/30/1996" "7/27/1997" "6/30/1997" ...
## $ Days : chr "10,075.00" "40,844.00" "14,087.00" "5,782.00" ...
## $ Age : chr "27.00" "111.00" "38.00" "15.83" ...
## $ Loyalty : chr "Dumbledore" "Dumbledore" "Dumbledore" "Dumbledore" ...
## $ Status : int 1 1 2 2 1 1 1 1 1 1 ...
HarryPotter <- subset(HarryPotter, select = -c(Name, Birthdate, Deathdate.or.Censor, Days))
HarryPotter$Age <- as.integer(HarryPotter$Age)
str(HarryPotter)
## 'data.frame': 127 obs. of 7 variables:
## $ Sex : chr "Male" "Male" "Male" "Male" ...
## $ House : chr "Gryffindor" NA NA "Gryffindor" ...
## $ Race : chr "Half-Human" "Human" "Human" "Human" ...
## $ Blood.status: chr "Pure-blood" "Half-blood" "Pure-blood" "Half-blood" ...
## $ Age : int 27 111 38 15 17 19 16 20 27 47 ...
## $ Loyalty : chr "Dumbledore" "Dumbledore" "Dumbledore" "Dumbledore" ...
## $ Status : int 1 1 2 2 1 1 1 1 1 1 ...
HarryPotter <- na.omit(HarryPotter)
HarryPotter2 <- HarryPotter[HarryPotter$Age >= 5, ]
summary(HarryPotter2$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.00 18.00 26.50 44.04 39.50 991.00
str(HarryPotter2)
## 'data.frame': 92 obs. of 7 variables:
## $ Sex : chr "Male" "Male" "Female" "Female" ...
## $ House : chr "Gryffindor" "Gryffindor" "Slytherin" "Gryffindor" ...
## $ Race : chr "Half-Human" "Human" "Human" "Human" ...
## $ Blood.status: chr "Pure-blood" "Half-blood" "Unknown" "Unknown" ...
## $ Age : int 27 15 17 19 16 20 27 47 46 42 ...
## $ Loyalty : chr "Dumbledore" "Dumbledore" "Voldemort" "Dumbledore" ...
## $ Status : int 1 2 1 1 1 1 1 1 1 1 ...
## - attr(*, "na.action")= 'omit' Named int [1:34] 2 3 11 14 15 16 18 22 28 29 ...
## ..- attr(*, "names")= chr [1:34] "2" "3" "11" "14" ...
summary(HarryPotter2)
## Sex House Race Blood.status
## Length:92 Length:92 Length:92 Length:92
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Age Loyalty Status
## Min. : 7.00 Length:92 Min. :1.000
## 1st Qu.: 18.00 Class :character 1st Qu.:1.000
## Median : 26.50 Mode :character Median :1.000
## Mean : 44.04 Mean :1.283
## 3rd Qu.: 39.50 3rd Qu.:2.000
## Max. :991.00 Max. :2.000
HPHouse=HarryPotter2[!grepl("Beauxbatons Academy of Magic",HarryPotter2$House),]
head(HPHouse,5)
## Sex House Race Blood.status Age Loyalty Status
## 1 Male Gryffindor Half-Human Pure-blood 27 Dumbledore 1
## 4 Male Gryffindor Human Half-blood 15 Dumbledore 2
## 5 Female Slytherin Human Unknown 17 Voldemort 1
## 6 Female Gryffindor Human Unknown 19 Dumbledore 1
## 7 Male Slytherin Human Unknown 16 Voldemort 1
HPHouse=HPHouse[!grepl("Durmstrang Institute",HPHouse$House),]
print(freq_table(HPHouse$House))
## # A tibble: 4 × 3
## group n prop
## <chr> <int> <dbl>
## 1 Gryffindor 35 39.3
## 2 Hufflepuff 12 13.5
## 3 Ravenclaw 18 20.2
## 4 Slytherin 24 27
HPHouse=HPHouse[!grepl("Unknown",HPHouse$Loyalty),]
head(HPHouse,2)
## Sex House Race Blood.status Age Loyalty Status
## 1 Male Gryffindor Half-Human Pure-blood 27 Dumbledore 1
## 4 Male Gryffindor Human Half-blood 15 Dumbledore 2
print(freq_table(HPHouse$Loyalty))
## # A tibble: 2 × 3
## group n prop
## <chr> <int> <dbl>
## 1 Dumbledore 63 74.1
## 2 Voldemort 22 25.9
counts <- sort(table(HPHouse$House), decreasing = TRUE)
# Number of states in each region
percentages <- 100 * counts / length(HPHouse$House)
barplot(percentages, ylab = "Percentage", col = "purple")
text(x=seq(0.7, 5, 1.2), 2, paste("n=", counts))
HPHouse$House <- as.factor(HPHouse$House)
levels(HPHouse$House)[levels(HPHouse$House) == "Gryffindor"] <- "Gryffindor"
levels(HPHouse$House)[levels(HPHouse$House) == "Hufflepuff"] <- "Hufflepuff"
levels(HPHouse$House)[levels(HPHouse$House) == "Slytherin"] <- "Slytherin"
levels(HPHouse$House)[levels(HPHouse$House) == "Ravenclaw"] <- "Ravenclaw"
str(HPHouse$House)
## Factor w/ 4 levels "Gryffindor","Hufflepuff",..: 1 1 4 1 4 1 3 4 1 4 ...
print(freq_table(HPHouse$Blood.status))
## # A tibble: 5 × 3
## group n prop
## <chr> <int> <dbl>
## 1 Half-blood 19 22.4
## 2 Muggle-born 6 7.1
## 3 Part-Goblin 1 1.2
## 4 Pure-blood 24 28.2
## 5 Unknown 35 41.2
HPHouse$Blood.status <- as.factor(HPHouse$Blood.status)
levels(HPHouse$Blood.status)[levels(HPHouse$Blood.status) == "Half-blood"] <- "Half-blood"
levels(HPHouse$Blood.status)[levels(HPHouse$Blood.status) == "Muggle-born"] <- "Muggle-born"
levels(HPHouse$Blood.status)[levels(HPHouse$Blood.status) == "Part-Goblin"] <- "Part-Goblin"
levels(HPHouse$Blood.status)[levels(HPHouse$Blood.status) == "Pure-blood"] <- "Pure-blood"
levels(HPHouse$Blood.status)[levels(HPHouse$Blood.status) == "Unknown"] <- "Unknown"
str(HPHouse$Blood.status)
## Factor w/ 5 levels "Half-blood","Muggle-born",..: 4 1 5 5 5 5 1 5 4 5 ...
lbls <- c( "<5", "6-10", "11-15", "16-20", "21-30","31-40","41-55","56-75","76-100","101-1600" )
HPHouse$Age_Cat <- cut( HPHouse$Age, breaks = c( -Inf, 6, 11, 16, 21, 31, 41,56,76,101, Inf ), labels = lbls, right = FALSE )
head(HPHouse,5)
## Sex House Race Blood.status Age Loyalty Status Age_Cat
## 1 Male Gryffindor Half-Human Pure-blood 27 Dumbledore 1 21-30
## 4 Male Gryffindor Human Half-blood 15 Dumbledore 2 11-15
## 5 Female Slytherin Human Unknown 17 Voldemort 1 16-20
## 6 Female Gryffindor Human Unknown 19 Dumbledore 1 16-20
## 7 Male Slytherin Human Unknown 16 Voldemort 1 16-20
factor(HPHouse$Age_Cat, ordered = TRUE)
## [1] 21-30 11-15 16-20 16-20 16-20 16-20 21-30 41-55
## [9] 41-55 41-55 76-100 16-20 21-30 21-30 6-10 21-30
## [17] 16-20 16-20 11-15 31-40 16-20 31-40 31-40 16-20
## [25] 31-40 31-40 16-20 76-100 16-20 31-40 6-10 56-75
## [33] 31-40 16-20 11-15 16-20 56-75 21-30 16-20 101-1600
## [41] 21-30 16-20 16-20 16-20 11-15 21-30 41-55 21-30
## [49] 16-20 31-40 21-30 16-20 16-20 56-75 41-55 76-100
## [57] 11-15 76-100 16-20 16-20 21-30 16-20 16-20 31-40
## [65] 21-30 21-30 41-55 31-40 31-40 6-10 16-20 76-100
## [73] 56-75 76-100 16-20 41-55 31-40 16-20 41-55 21-30
## [81] 31-40 56-75 21-30 41-55 11-15
## 9 Levels: 6-10 < 11-15 < 16-20 < 21-30 < 31-40 < 41-55 < 56-75 < ... < 101-1600
HPHouse$Loyalty <- as.factor(HPHouse$Loyalty)
levels(HPHouse$Loyalty)[levels(HPHouse$Loyalty) == "Dumbledore"] <- "Dumbledore"
levels(HPHouse$Loyalty)[levels(HPHouse$Loyalty) == "Voldemort"] <- "Voldemort"
str(HPHouse$Loyalty)
## Factor w/ 2 levels "Dumbledore","Voldemort": 1 1 2 1 2 1 1 2 1 2 ...
str(HPHouse)
## 'data.frame': 85 obs. of 8 variables:
## $ Sex : chr "Male" "Male" "Female" "Female" ...
## $ House : Factor w/ 4 levels "Gryffindor","Hufflepuff",..: 1 1 4 1 4 1 3 4 1 4 ...
## $ Race : chr "Half-Human" "Human" "Human" "Human" ...
## $ Blood.status: Factor w/ 5 levels "Half-blood","Muggle-born",..: 4 1 5 5 5 5 1 5 4 5 ...
## $ Age : int 27 15 17 19 16 20 27 47 46 42 ...
## $ Loyalty : Factor w/ 2 levels "Dumbledore","Voldemort": 1 1 2 1 2 1 1 2 1 2 ...
## $ Status : int 1 2 1 1 1 1 1 1 1 1 ...
## $ Age_Cat : Factor w/ 10 levels "<5","6-10","11-15",..: 5 3 4 4 4 4 5 7 7 7 ...
## - attr(*, "na.action")= 'omit' Named int [1:34] 2 3 11 14 15 16 18 22 28 29 ...
## ..- attr(*, "names")= chr [1:34] "2" "3" "11" "14" ...
print(freq_table(HPHouse$Sex))
## # A tibble: 3 × 3
## group n prop
## <chr> <int> <dbl>
## 1 "" 1 1.2
## 2 "Female" 29 34.1
## 3 "Male" 55 64.7
print(freq_table(HPHouse$Age_Cat))
## # A tibble: 9 × 3
## group n prop
## <fct> <int> <dbl>
## 1 6-10 3 3.5
## 2 11-15 6 7.1
## 3 16-20 27 31.8
## 4 21-30 15 17.6
## 5 31-40 13 15.3
## 6 41-55 9 10.6
## 7 56-75 5 5.9
## 8 76-100 6 7.1
## 9 101-1600 1 1.2
HPHouse <- HPHouse[-which(HPHouse$Sex == ""), ]
print(freq_table(HPHouse$Sex))
## # A tibble: 2 × 3
## group n prop
## <chr> <int> <dbl>
## 1 Female 29 34.5
## 2 Male 55 65.5
LoyaltyAge <- HPHouse %>%
group_by(Loyalty, House, Status) %>%
tally()
head(LoyaltyAge,5)
## # A tibble: 5 × 4
## # Groups: Loyalty, House [3]
## Loyalty House Status n
## <fct> <fct> <int> <int>
## 1 Dumbledore Gryffindor 1 25
## 2 Dumbledore Gryffindor 2 9
## 3 Dumbledore Hufflepuff 1 6
## 4 Dumbledore Hufflepuff 2 5
## 5 Dumbledore Ravenclaw 1 14
#Rename columns
colnames(LoyaltyAge)[1] <- "Loyalty"
colnames(LoyaltyAge)[2] <-"House"
colnames(LoyaltyAge)[3] <- "Status"
colnames(LoyaltyAge)[4] <- "Count"
head(LoyaltyAge, 5)
## # A tibble: 5 × 4
## # Groups: Loyalty, House [3]
## Loyalty House Status Count
## <fct> <fct> <int> <int>
## 1 Dumbledore Gryffindor 1 25
## 2 Dumbledore Gryffindor 2 9
## 3 Dumbledore Hufflepuff 1 6
## 4 Dumbledore Hufflepuff 2 5
## 5 Dumbledore Ravenclaw 1 14
datatable(LoyaltyAge,extensions = 'Buttons',
options = list(dom='Bfrtip',
buttons=c('copy', 'csv', 'excel', 'print', 'pdf')))
# loading the package
group_by(HPHouse,Loyalty) %>%
summarise(
count = n(),
median = median(Age, na.rm = TRUE),
IQR = IQR(Age, na.rm = TRUE))
## # A tibble: 2 × 4
## Loyalty count median IQR
## <fct> <int> <dbl> <dbl>
## 1 Dumbledore 62 21 18
## 2 Voldemort 22 36 25.5
ggboxplot(HPHouse, x = "Loyalty", y = "Age",
color = "Loyalty", palette = c("purple","hotpink"),
ylab = "Age", xlab = "Loyalty")
res <- wilcox.test(Age ~ Loyalty, conf.int = T, data = HPHouse)
res
##
## Wilcoxon rank sum test with continuity correction
##
## data: Age by Loyalty
## W = 480, p-value = 0.04
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -1.800001e+01 -1.120268e-05
## sample estimates:
## difference in location
## -9.999947
library(doBy)
library(rstatix)
library(gtsummary)
library(tidyverse)
summaryBy(Age ~ House,
data = HPHouse,
FUN = median,
na.rm = TRUE
)
## House Age.median
## 1 Gryffindor 19
## 2 Hufflepuff 19
## 3 Ravenclaw 34
## 4 Slytherin 37
ggplot(HPHouse) +
aes(x = House, y = Age, fill = House) +
geom_boxplot() +
theme(legend.position = "none")
kruskal.test(Age ~ House, data = HPHouse)
##
## Kruskal-Wallis rank sum test
##
## data: Age by House
## Kruskal-Wallis chi-squared = 12.737, df = 3, p-value = 0.005242
pairwise.wilcox.test(HPHouse$Age, HPHouse$House,
p.adjust.method = "BH")
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: HPHouse$Age and HPHouse$House
##
## Gryffindor Hufflepuff Ravenclaw
## Hufflepuff 0.948 - -
## Ravenclaw 0.017 0.054 -
## Slytherin 0.037 0.101 0.948
##
## P value adjustment method: BH
embed_url("https://youtu.be/dxKLEOMfUL4?si=JSwO4RvGaRVF4_sb")