OVERVIEW The data I will be working with for this assignment is the Halloween candy power ranking. I will show which type of candy has the highest win percentage. The data was taken from https://github.com/fivethirtyeight/data/tree/master/candy-power-ranking

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(ggplot2)
options(readr.show_col_types = FALSE)

Loading the candy data (the CSV file from the GitHub repository)

# Location of the candy power ranking data. The local copy is used when it
# exists; otherwise the same CSV is read straight from the FiveThirtyEight
# GitHub repository so the analysis can be reproduced on any machine.
Candy <- "~/Data_607/candy-data.csv"
candy_url <- paste0(
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/",
  "candy-power-ranking/candy-data.csv"
)
if (!file.exists(Candy)) {
  Candy <- candy_url
}
candy_data <- read_csv(Candy)

# Preview the first rows to confirm the file was read as expected.
head(candy_data)

Renaming a few columns

# Give the three columns used in this analysis report-friendly names
# (new name = old name); all other columns keep their original names.
candy_data <- rename(
  candy_data,
  CandyName = competitorname,
  Chocolate = chocolate,
  WinPercent = winpercent
)

# Preview the renamed columns.
head(candy_data)

Changing the values in the Chocolate column from 1s and 0s to “Yes” and “No”

# Recode the Chocolate indicator into labels: values that ifelse() treats
# as TRUE become "Yes", values it treats as FALSE become "No".
candy_data <- candy_data %>%
  mutate(Chocolate = ifelse(Chocolate, "Yes", "No"))

# Preview the recoded column.
head(candy_data)

Selecting relevant columns for comparison

# Keep only the candy name, the chocolate label and the win percent,
# which are the columns used in the comparison.
candy_subset <- select(candy_data, CandyName, Chocolate, WinPercent)

Creating a box plot comparing the win percent of candy with chocolate to candy without chocolate

# Box plot of win percent, one box per Chocolate group; the fill colour
# follows the same grouping as the x axis.
candy_subset %>%
  ggplot(mapping = aes(x = Chocolate, y = WinPercent, fill = Chocolate)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Comparison of Chocolate vs. Non-Chocolate Candy Win Percent",
    x = "Contains Chocolate",
    y = "Win Percent"
  )

# Conclusion: Candies that contain chocolate have the highest win percentage.