library(knitr)
library(rmarkdown)
library(magrittr)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the contestant-by-episode CSV straight from GitHub; character columns
# stay as plain strings rather than being converted to factors.
my_url <- "https://raw.githubusercontent.com/geedoubledee/R/main/rpdr_contep.csv"
rpdr_df <- read.csv(my_url, header = TRUE, stringsAsFactors = FALSE)
Many of the summary statistics returned by the summary function are meaningless for this particular dataset (for example, X runs from 1 to 2320 like a row index, and several columns only take the values 0 and 1), so below I calculate the means and medians for two attributes I am interested in: the number of episodes per season and the ranks of the Miss Congeniality winners.
# Column-by-column summary of the full data set (quartiles/mean for numeric
# columns, length/class/mode for character columns).
summary(rpdr_df)
## X season rank missc
## Min. : 1.0 Length:2320 Min. : 1.00 Min. :0.00000
## 1st Qu.: 580.8 Class :character 1st Qu.: 3.00 1st Qu.:0.00000
## Median :1160.5 Mode :character Median : 7.00 Median :0.00000
## Mean :1160.5 Mean : 7.08 Mean :0.07545
## 3rd Qu.:1740.2 3rd Qu.:10.00 3rd Qu.:0.00000
## Max. :2320.0 Max. :15.00 Max. :1.00000
## NA's :14 NA's :14
## contestant episode outcome eliminated
## Length:2320 Min. : 1.000 Length:2320 Min. :0.0000
## Class :character 1st Qu.: 4.000 Class :character 1st Qu.:0.0000
## Mode :character Median : 7.000 Mode :character Median :0.0000
## Mean : 7.132 Mean :0.0897
## 3rd Qu.:10.000 3rd Qu.:0.0000
## Max. :16.000 Max. :1.0000
## NA's :870
## participant minichalw finale penultimate
## Min. :0.000 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :1.000 Median :0.0000 Median :0.00000 Median :0.00000
## Mean :0.625 Mean :0.0455 Mean :0.07931 Mean :0.01724
## 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.000 Max. :1.0000 Max. :1.00000 Max. :1.00000
## NA's :870
# One row per season; ep_count is the highest episode number seen in that
# season.
rpdr_df_temp <- rpdr_df %>%
  group_by(season) %>%
  summarize(ep_count = max(episode))
head(rpdr_df_temp)
## # A tibble: 6 × 2
## season ep_count
## <chr> <int>
## 1 S01 8
## 2 S02 11
## 3 S03 15
## 4 S04 14
## 5 S05 14
## 6 S06 14
# Report the centre of the per-season episode counts.
print("The mean number of episodes per season is:")
## [1] "The mean number of episodes per season is:"
print(mean(rpdr_df_temp[["ep_count"]]))
## [1] 13.42857
print("The median number of episodes per season is:")
## [1] "The median number of episodes per season is:"
print(median(rpdr_df_temp[["ep_count"]]))
## [1] 14
# Keep only rows flagged as Miss Congeniality (missc is a 0/1 indicator), then
# collapse the per-episode repeats down to one row per season/contestant/rank.
rpdr_df_temp <- rpdr_df %>%
  filter(missc == 1) %>%
  distinct(season, contestant, rank)
head(rpdr_df_temp)
## season rank contestant
## 1 S01 2 Nina Flowers
## 2 S02 5 Pandora Boxx
## 3 S03 4 Yara Sofia
## 4 S04 4 Latrice Royale
## 5 S05 7 Ivy Winters
## 6 S06 5 BenDeLaCreme
# Report the centre of the Miss Congeniality winners' ranks.
print("The mean rank of the Miss Congeniality winners is:")
## [1] "The mean rank of the Miss Congeniality winners is:"
print(mean(rpdr_df_temp[["rank"]]))
## [1] 6.285714
print("The median rank of the Miss Congeniality winners is:")
## [1] "The median rank of the Miss Congeniality winners is:"
print(median(rpdr_df_temp[["rank"]]))
## [1] 5.5
# Build the wins-only subset: keep rows whose outcome is one of the winning
# codes, trim to the five columns of interest, and sort by rank and then by
# contestant.
rpdr_df_wins_only <- rpdr_df %>%
  filter(outcome %in% c("WIN", "WIN+RTRN", "Winner")) %>%
  select(season, contestant, rank, episode, outcome) %>%
  arrange(rank, contestant)
head(rpdr_df_wins_only)
## season contestant rank episode outcome
## 1 S10 Aquaria 1 4 WIN
## 2 S10 Aquaria 1 7 WIN
## 3 S10 Aquaria 1 11 WIN
## 4 S10 Aquaria 1 14 WIN
## 5 S01 BeBe Zahara Benet 1 3 WIN
## 6 S01 BeBe Zahara Benet 1 6 WIN
# Give every column a short upper-case name, mapping each one explicitly from
# its current name.
rpdr_df_wins_only <- rpdr_df_wins_only %>%
  rename(
    SEAS = season,
    CONT = contestant,
    RANK = rank,
    EPIS = episode,
    OUTC = outcome
  )
head(rpdr_df_wins_only)
## SEAS CONT RANK EPIS OUTC
## 1 S10 Aquaria 1 4 WIN
## 2 S10 Aquaria 1 7 WIN
## 3 S10 Aquaria 1 11 WIN
## 4 S10 Aquaria 1 14 WIN
## 5 S01 BeBe Zahara Benet 1 3 WIN
## 6 S01 BeBe Zahara Benet 1 6 WIN
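Again, the summary function isn’t particularly useful for this dataset. Neither the mean nor the median number of episodes per season should have changed for our subset of the data though, because the episode count is taken as each season’s highest episode number and that is unaffected as long as the final episode of every season still has a winning row in the subset. I recalculated both below to confirm, and they remain the same.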
# Column-by-column summary of the wins-only subset with its renamed columns.
summary(rpdr_df_wins_only)
## SEAS CONT RANK EPIS
## Length:192 Length:192 Min. : 1.000 Min. : 1.000
## Class :character Class :character 1st Qu.: 1.000 1st Qu.: 3.000
## Mode :character Mode :character Median : 2.000 Median : 7.000
## Mean : 3.142 Mean : 6.781
## 3rd Qu.: 4.000 3rd Qu.:10.000
## Max. :12.000 Max. :16.000
## NA's :2
## OUTC
## Length:192
## Class :character
## Mode :character
##
##
##
##
# Recompute the per-season episode count from the wins-only subset: one row
# per season, EP_COUNT being the highest episode number among its rows.
rpdr_df_wins_only_temp <- rpdr_df_wins_only %>%
  group_by(SEAS) %>%
  summarize(EP_COUNT = max(EPIS))
head(rpdr_df_wins_only_temp)
## # A tibble: 6 × 2
## SEAS EP_COUNT
## <chr> <int>
## 1 S01 8
## 2 S02 11
## 3 S03 15
## 4 S04 14
## 5 S05 14
## 6 S06 14
# Confirm the episode-count statistics on the wins-only subset.
print("The mean number of episodes per season is still:")
## [1] "The mean number of episodes per season is still:"
print(mean(rpdr_df_wins_only_temp[["EP_COUNT"]]))
## [1] 13.42857
print("The median number of episodes per season is still:")
## [1] "The median number of episodes per season is still:"
print(median(rpdr_df_wins_only_temp[["EP_COUNT"]]))
## [1] 14
Our subset of the data no longer includes the Miss Congeniality winner data, so let’s look at the mean and median number of episode wins for each season winner (where RANK == 1) instead.
# Count the winning rows of each season's champion (RANK == 1) in the
# wins-only subset, producing one row per season/winner pair.
# NOTE(review): the wins-only subset also keeps rows whose outcome is
# "Winner", so those rows are counted in WINS alongside "WIN" and "WIN+RTRN"
# -- confirm that is the intended meaning of "episode wins".
rpdr_df_wins_only_temp <- rpdr_df_wins_only %>%
  filter(RANK == 1) %>% # keep only the winner of each season
  group_by(SEAS, CONT) %>%
  # .groups = "drop" returns an ungrouped tibble (instead of one still grouped
  # by SEAS) and silences the "grouped output" message from summarise()
  summarize(WINS = n(), .groups = "drop") %>%
  # rename by name rather than by position so a change in column order cannot
  # silently mislabel the columns
  rename(SEASON = SEAS, WINNER = CONT)
head(rpdr_df_wins_only_temp)
## # A tibble: 6 × 3
## SEASON WINNER WINS
## <chr> <chr> <int>
## 1 S01 BeBe Zahara Benet 3
## 2 S02 Tyra Sanchez 4
## 3 S03 Raja 4
## 4 S04 Sharon Needles 5
## 5 S05 Jinkx Monsoon 3
## 6 S06 Bianca Del Rio 5
# Report the centre of the per-winner win counts.
print("The mean number of episode wins for each season winner is:")
## [1] "The mean number of episode wins for each season winner is:"
print(mean(rpdr_df_wins_only_temp[["WINS"]]))
## [1] 3.928571
print("The median number of episode wins for each season winner is:")
## [1] "The median number of episode wins for each season winner is:"
print(median(rpdr_df_wins_only_temp[["WINS"]]))
## [1] 4
# Ordinal words indexed by numeric rank: element i is the word for rank i.
rank_words <- c(
  "FIRST", "SECOND", "THIRD", "FOURTH",
  "FIFTH", "SIXTH", "SEVENTH", "EIGHTH",
  "NINTH", "TENTH", "ELEVENTH", "TWELFTH",
  "THIRTEENTH", "FOURTEENTH", "FIFTEENTH", "SIXTEENTH"
)
# Replace each numeric rank with its ordinal word by using the rank as an
# index into rank_words; a missing rank stays NA.
rpdr_df_ranks_renamed <- rpdr_df %>%
  mutate(rank = rank_words[rank])
head(rpdr_df_ranks_renamed)
## X season rank missc contestant episode outcome eliminated
## 1 1 S01 FIRST 0 BeBe Zahara Benet 1 SAFE 0
## 2 2 S01 SECOND 1 Nina Flowers 1 WIN 0
## 3 3 S01 THIRD 0 Rebecca Glasscock 1 LOW 0
## 4 4 S01 FOURTH 0 Shannel 1 SAFE 0
## 5 5 S01 FIFTH 0 Ongina 1 HIGH 0
## 6 6 S01 SIXTH 0 Jade 1 SAFE 0
## participant minichalw finale penultimate
## 1 1 0 0 0
## 2 1 0 0 0
## 3 1 0 0 0
## 4 1 0 0 0
## 5 1 0 0 0
## 6 1 0 0 0