R_Bridge_Week2_Assignment

Read the Data/Load Packages

library(knitr)
library(rmarkdown)
library(magrittr)
library(plyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Location of the contestant-by-episode CSV on GitHub.
my_url <- "https://raw.githubusercontent.com/geedoubledee/R/main/rpdr_contep.csv"

# Read the CSV with its header row; keep text columns as character, not factor.
rpdr_df <- read.csv(my_url, header = TRUE, stringsAsFactors = FALSE)

Display Summary of Data, Including Mean/Median for Two Attributes

A lot of the summary statistics returned by the summary function are meaningless for this particular dataset, so below I calculate means/medians for two attributes I am interested in: the number of episodes per season and the ranks of the Miss Congeniality winners.

# Column-by-column overview of the full data frame.
rpdr_df %>% summary()

##        X             season               rank           missc        
##  Min.   :   1.0   Length:2320        Min.   : 1.00   Min.   :0.00000  
##  1st Qu.: 580.8   Class :character   1st Qu.: 3.00   1st Qu.:0.00000  
##  Median :1160.5   Mode  :character   Median : 7.00   Median :0.00000  
##  Mean   :1160.5                      Mean   : 7.08   Mean   :0.07545  
##  3rd Qu.:1740.2                      3rd Qu.:10.00   3rd Qu.:0.00000  
##  Max.   :2320.0                      Max.   :15.00   Max.   :1.00000  
##                                      NA's   :14      NA's   :14       
##   contestant           episode         outcome            eliminated    
##  Length:2320        Min.   : 1.000   Length:2320        Min.   :0.0000  
##  Class :character   1st Qu.: 4.000   Class :character   1st Qu.:0.0000  
##  Mode  :character   Median : 7.000   Mode  :character   Median :0.0000  
##                     Mean   : 7.132                      Mean   :0.0897  
##                     3rd Qu.:10.000                      3rd Qu.:0.0000  
##                     Max.   :16.000                      Max.   :1.0000  
##                                                         NA's   :870     
##   participant      minichalw          finale         penultimate     
##  Min.   :0.000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :1.000   Median :0.0000   Median :0.00000   Median :0.00000  
##  Mean   :0.625   Mean   :0.0455   Mean   :0.07931   Mean   :0.01724  
##  3rd Qu.:1.000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000  
##                  NA's   :870

# One row per season; the highest episode number seen in a season is used
# as that season's episode count.
rpdr_df_temp <- rpdr_df %>%
    group_by(season) %>%
    summarize(ep_count = max(episode))

head(rpdr_df_temp)

## # A tibble: 6 × 2
##   season ep_count
##   <chr>     <int>
## 1 S01           8
## 2 S02          11
## 3 S03          15
## 4 S04          14
## 5 S05          14
## 6 S06          14

# Pull the per-season episode counts out once and reuse them below.
ep_counts <- rpdr_df_temp[["ep_count"]]

print("The mean number of episodes per season is:")

## [1] "The mean number of episodes per season is:"

mean(ep_counts)

## [1] 13.42857

print("The median number of episodes per season is:")

## [1] "The median number of episodes per season is:"

median(ep_counts)

## [1] 14

# Keep only rows flagged as Miss Congeniality (missc is a 0/1 indicator),
# then collapse the per-episode rows to one row per season/contestant/rank.
rpdr_df_temp <- rpdr_df %>%
    filter(missc == 1) %>%
    distinct(season, contestant, rank)

head(rpdr_df_temp)

##   season rank     contestant
## 1    S01    2   Nina Flowers
## 2    S02    5   Pandora Boxx
## 3    S03    4     Yara Sofia
## 4    S04    4 Latrice Royale
## 5    S05    7    Ivy Winters
## 6    S06    5   BenDeLaCreme

# Final ranks of the Miss Congeniality winners, extracted once for both statistics.
missc_ranks <- rpdr_df_temp[["rank"]]

print("The mean rank of the Miss Congeniality winners is:")

## [1] "The mean rank of the Miss Congeniality winners is:"

mean(missc_ranks)

## [1] 6.285714

print("The median rank of the Miss Congeniality winners is:")

## [1] "The median rank of the Miss Congeniality winners is:"

median(missc_ranks)

## [1] 5.5

Create New Data Frame with Subset of Columns/Rows and Rename

# Outcome labels that count as a win for this subset.
winning_outcomes <- c("WIN", "WIN+RTRN", "Winner")

# Keep five columns, restrict to winning outcomes, and sort by final rank
# and then contestant name.
rpdr_df_wins_only <- rpdr_df %>%
    select(season, contestant, rank, episode, outcome) %>%
    filter(outcome %in% winning_outcomes) %>%
    arrange(rank, contestant)

head(rpdr_df_wins_only)

##   season        contestant rank episode outcome
## 1    S10           Aquaria    1       4     WIN
## 2    S10           Aquaria    1       7     WIN
## 3    S10           Aquaria    1      11     WIN
## 4    S10           Aquaria    1      14     WIN
## 5    S01 BeBe Zahara Benet    1       3     WIN
## 6    S01 BeBe Zahara Benet    1       6     WIN

Create New Column Names for the New Data Frame

# Give each column a short upper-case name, matched by its current name
# rather than by position.
rpdr_df_wins_only %<>%
    rename(
        SEAS = season,
        CONT = contestant,
        RANK = rank,
        EPIS = episode,
        OUTC = outcome
    )

head(rpdr_df_wins_only)

##   SEAS              CONT RANK EPIS OUTC
## 1  S10           Aquaria    1    4  WIN
## 2  S10           Aquaria    1    7  WIN
## 3  S10           Aquaria    1   11  WIN
## 4  S10           Aquaria    1   14  WIN
## 5  S01 BeBe Zahara Benet    1    3  WIN
## 6  S01 BeBe Zahara Benet    1    6  WIN

Compare Mean/Median of Attributes from New Data Frame to Old Data Frame

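Again, the summary function isn’t particularly useful for this dataset. Neither the mean nor the median number of episodes per season should have changed for our subset of the data, though: as long as each season’s final episode still has a winning outcome among the filtered rows, the maximum episode number per season is unaffected. I recalculated both below to confirm, and they remain the same.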

# Column-by-column overview of the wins-only subset.
rpdr_df_wins_only %>% summary()

##      SEAS               CONT                RANK             EPIS       
##  Length:192         Length:192         Min.   : 1.000   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.: 1.000   1st Qu.: 3.000  
##  Mode  :character   Mode  :character   Median : 2.000   Median : 7.000  
##                                        Mean   : 3.142   Mean   : 6.781  
##                                        3rd Qu.: 4.000   3rd Qu.:10.000  
##                                        Max.   :12.000   Max.   :16.000  
##                                        NA's   :2                        
##      OUTC          
##  Length:192        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

# Recompute the per-season episode count from the wins-only subset:
# the highest episode number among the remaining rows of each season.
rpdr_df_wins_only_temp <- rpdr_df_wins_only %>%
    group_by(SEAS) %>%
    summarize(EP_COUNT = max(EPIS))

head(rpdr_df_wins_only_temp)

## # A tibble: 6 × 2
##   SEAS  EP_COUNT
##   <chr>    <int>
## 1 S01          8
## 2 S02         11
## 3 S03         15
## 4 S04         14
## 5 S05         14
## 6 S06         14

# Per-season episode counts from the subset, extracted once for both statistics.
wins_ep_counts <- rpdr_df_wins_only_temp[["EP_COUNT"]]

print("The mean number of episodes per season is still:")

## [1] "The mean number of episodes per season is still:"

mean(wins_ep_counts)

## [1] 13.42857

print("The median number of episodes per season is still:")

## [1] "The median number of episodes per season is still:"

median(wins_ep_counts)

## [1] 14

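Our subset of the data no longer includes the Miss Congeniality winner data, so let’s look at the mean and median number of episode wins for each season winner (where RANK == 1) instead. Note that the count includes every outcome kept by the filter above, i.e. "WIN", "WIN+RTRN", and "Winner".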

# Count the winning rows for each season's eventual winner.
rpdr_df_wins_only_temp <- rpdr_df_wins_only

rpdr_df_wins_only_temp %<>%
    filter(RANK == 1) %>% #display only the winners of each season
    group_by(SEAS, CONT) %>%
    # Drop the grouping once the counts are computed. Without .groups = "drop"
    # the result stays grouped by SEAS and summarize() emits a message about it.
    summarize(WINS = n(), .groups = "drop")

# Friendlier column names for display.
colnames(rpdr_df_wins_only_temp) <- c("SEASON", "WINNER", "WINS")
head(rpdr_df_wins_only_temp)

## # A tibble: 6 × 3
##   SEASON WINNER             WINS
##   <chr>  <chr>             <int>
## 1 S01    BeBe Zahara Benet     3
## 2 S02    Tyra Sanchez          4
## 3 S03    Raja                  4
## 4 S04    Sharon Needles        5
## 5 S05    Jinkx Monsoon         3
## 6 S06    Bianca Del Rio        5

# Win counts per season winner, extracted once for both statistics.
winner_wins <- rpdr_df_wins_only_temp[["WINS"]]

print("The mean number of episode wins for each season winner is:")

## [1] "The mean number of episode wins for each season winner is:"

mean(winner_wins)

## [1] 3.928571

print("The median number of episode wins for each season winner is:")

## [1] "The median number of episode wins for each season winner is:"

median(winner_wins)

## [1] 4

Rename the Values in One Column

# Ordinal labels, positioned so that element k is the word for rank k.
rank_words <- c(
    "FIRST", "SECOND", "THIRD", "FOURTH",
    "FIFTH", "SIXTH", "SEVENTH", "EIGHTH",
    "NINTH", "TENTH", "ELEVENTH", "TWELFTH",
    "THIRTEENTH", "FOURTEENTH", "FIFTEENTH", "SIXTEENTH"
)

# Replace each numeric rank with its word by using the rank as an index
# into rank_words.
rpdr_df_ranks_renamed <- rpdr_df %>%
    mutate(rank = rank_words[rank])

head(rpdr_df_ranks_renamed)

##   X season   rank missc        contestant episode outcome eliminated
## 1 1    S01  FIRST     0 BeBe Zahara Benet       1    SAFE          0
## 2 2    S01 SECOND     1      Nina Flowers       1     WIN          0
## 3 3    S01  THIRD     0 Rebecca Glasscock       1     LOW          0
## 4 4    S01 FOURTH     0           Shannel       1    SAFE          0
## 5 5    S01  FIFTH     0            Ongina       1    HIGH          0
## 6 6    S01  SIXTH     0              Jade       1    SAFE          0
##   participant minichalw finale penultimate
## 1           1         0      0           0
## 2           1         0      0           0
## 3           1         0      0           0
## 4           1         0      0           0
## 5           1         0      0           0
## 6           1         0      0           0