Data Cleaning for Olympic medalists data

medalists <- read.csv("medallists.csv")

Remove extra columns

medalists1 <- medalists[, c(2, 4, 5, 7, 14)]

Fix capitalization miscues in names

library(stringr)
medalists1$name <- str_to_title(medalists1$name)

Add back Russia instead of “AIN”

which(medalists1 == "AIN") #3: identify
[1] 7180 7184 7216 7654 8375 8376
medalists1[which(medalists1$country == "AIN"), "country"] <- "Russia"
library(dplyr)

Add a column for continent

medalists1 <- medalists1 %>%
  mutate(continent = case_when(
    country %in% c("United States", "Canada", "Mexico", "Puerto Rico", "Jamaica", "Saint Lucia", 
                   "Cuba", "Guatemala", "Dominican Republic", "Grenada", "Dominica", "Panama") 
    ~ "North America",
    country %in% c("Brazil", "Argentina", "Chile", "Colombia", "Peru", "Venezuela", "Ecuador", "Bolivia", "Paraguay", "Uruguay", "Guyana", "Suriname", "French Guiana") 
    ~ "South America",
    country %in% c("Great Britain", "Germany", "France", "Italy", "Spain", "Russia", "Poland", "Netherlands", "Belgium", "Sweden", "Ukraine", "Switzerland", "Greece", "Portugal", "Czechia", "Denmark", "Romania", "Hungary", "Norway", "Finland", "Bulgaria", "Serbia", "Croatia", "Slovakia", "Slovenia", "Lithuania", "Latvia", "Estonia", "Belarus", "Iceland", "Albania", "North Macedonia", "Republic of Moldova", "Kosovo", "Monaco", "San Marino", "Austria", "Andorra", "Liechtenstein", "Malta", "Ireland", "Cyprus") 
    ~ "Europe",
    country %in% c("China", "Hong Kong, China", "Chinese Taipei", "India", "Japan", "Korea", "DPR Korea", "Indonesia", "Pakistan", "Bangladesh", "Vietnam", "Philippines", "Thailand", "Malaysia", "Singapore", "Myanmar", "Cambodia", "Sri Lanka", "Nepal", "Afghanistan", "IR Iran", "Iraq", "Israel", "Jordan", "Kuwait", "Lebanon", "Oman", "Qatar", "Bahrain", "United Arab Emirates", "Yemen", "Kazakhstan", "Uzbekistan", "Turkmenistan", "Türkiye", "Kyrgyzstan", "Tajikistan", "Armenia", "Georgia", "Azerbaijan", "Mongolia") 
    ~ "Asia",
    country %in% c("South Africa", "Egypt", "Kenya", "Nigeria", "Ethiopia", "Tanzania", "Uganda", "Ghana", "Morocco", "Algeria", "Ivory Coast", "Senegal", "Cameroon", "Zimbabwe", "Angola", "Mali", "Zambia", "Rwanda", "Botswana", "Tunisia", "Sierra Leone", "Côte d'Ivoire", "Liberia", "Mauritius", "Namibia", "Lesotho", "Swaziland", "Cape Verde", "Gabon", "Seychelles", "Comoros", "Equatorial Guinea", "Burkina Faso", "Madagascar", "Mozambique", "Central African Republic", "Republic of the Congo", "Democratic Republic of the Congo", "Chad", "Eritrea", "Somalia", "Sudan", "South Sudan", "Mauritania", "Benin", "Togo", "Burundi", "Guinea", "Congo", "Malawi", "Gambia", "Guinea-Bissau", "Sao Tome and Principe", "Cabo Verde") 
    ~ "Africa",
    country %in% c("Australia", "New Zealand", "Papua New Guinea", "Fiji", "Samoa", "Tonga", "Vanuatu", "Solomon Islands", "Kiribati", "Tuvalu", "Marshall Islands", "Palau", "Nauru") 
    ~ "Oceania",
    country %in% c("EOR") ~ "Refugee"
  ))

function to find what values we missed in the continent column

get_na_countries <- function(medalists1) {
  # Filter the rows where 'continent' column has N/A values
  na_countries <- medalists1[is.na(medalists1$continent), "country"]
  
  # Return the countries with N/A values in the continent column
  return(na_countries)
}

# Call the function with your data (medalists1)
na_countries <- get_na_countries(medalists1)

# Print the result
print(na_countries)
character(0)
write.csv(medalists1, "medalists_final.csv")
summary(medalists1$country)
   Length     Class      Mode 
     2315 character character 
which(medalists1$country == "Puerto Rico")
[1] 657 745
LS0tCnRpdGxlOiAiRGF0YSBDbGVhbmluZyBmb3IgRmluYWwgUHJvamVjdCIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQojIERhdGEgQ2xlYW5pbmcgZm9yIE9seW1waWMgbWVkYWxpc3RzIGRhdGEKYGBge3J9Cm1lZGFsaXN0cyA8LSByZWFkLmNzdigibWVkYWxsaXN0cy5jc3YiKQpgYGAKCiMjIFJlbW92ZSBleHRyYSBjb2x1bW5zCmBgYHtyfQptZWRhbGlzdHMxIDwtIG1lZGFsaXN0c1ssIGMoMiwgNCwgNSwgNywgMTQpXQpgYGAKCiMjIEZpeCBjYXBpdGFsaXphdGlvbiBtaXNjdWVzIGluIG5hbWVzCmBgYHtyfQpsaWJyYXJ5KHN0cmluZ3IpCm1lZGFsaXN0czEkbmFtZSA8LSBzdHJfdG9fdGl0bGUobWVkYWxpc3RzMSRuYW1lKQpgYGAKCiMjIEFkZCBiYWNrIFJ1c3NpYSBpbnN0ZWFkIG9mICJBSU4iCmBgYHtyfQp3aGljaChtZWRhbGlzdHMxID09ICJBSU4iKSAjMzogaWRlbnRpZnkKbWVkYWxpc3RzMVt3aGljaChtZWRhbGlzdHMxJGNvdW50cnkgPT0gIkFJTiIpLCAiY291bnRyeSJdIDwtICJSdXNzaWEiCmBgYAoKYGBge3J9CmxpYnJhcnkoZHBseXIpCmBgYAoKIyMgQWRkIGEgY29sdW1uIGZvciBjb250aW5lbnQKYGBge3J9Cm1lZGFsaXN0czEgPC0gbWVkYWxpc3RzMSAlPiUKICBtdXRhdGUoY29udGluZW50ID0gY2FzZV93aGVuKAogICAgY291bnRyeSAlaW4lIGMoIlVuaXRlZCBTdGF0ZXMiLCAiQ2FuYWRhIiwgIk1leGljbyIsICJQdWVydG8gUmljbyIsICJKYW1haWNhIiwgIlNhaW50IEx1Y2lhIiwgCiAgICAgICAgICAgICAgICAgICAiQ3ViYSIsICJHdWF0ZW1hbGEiLCAiRG9taW5pY2FuIFJlcHVibGljIiwgIkdyZW5hZGEiLCAiRG9taW5pY2EiLCAiUGFuYW1hIikgCiAgICB+ICJOb3J0aCBBbWVyaWNhIiwKICAgIGNvdW50cnkgJWluJSBjKCJCcmF6aWwiLCAiQXJnZW50aW5hIiwgIkNoaWxlIiwgIkNvbG9tYmlhIiwgIlBlcnUiLCAiVmVuZXp1ZWxhIiwgIkVjdWFkb3IiLCAiQm9saXZpYSIsICJQYXJhZ3VheSIsICJVcnVndWF5IiwgIkd1eWFuYSIsICJTdXJpbmFtZSIsICJGcmVuY2ggR3VpYW5hIikgCiAgICB+ICJTb3V0aCBBbWVyaWNhIiwKICAgIGNvdW50cnkgJWluJSBjKCJHcmVhdCBCcml0YWluIiwgIkdlcm1hbnkiLCAiRnJhbmNlIiwgIkl0YWx5IiwgIlNwYWluIiwgIlJ1c3NpYSIsICJQb2xhbmQiLCAiTmV0aGVybGFuZHMiLCAiQmVsZ2l1bSIsICJTd2VkZW4iLCAiVWtyYWluZSIsICJTd2l0emVybGFuZCIsICJHcmVlY2UiLCAiUG9ydHVnYWwiLCAiQ3plY2hpYSIsICJEZW5tYXJrIiwgIlJvbWFuaWEiLCAiSHVuZ2FyeSIsICJOb3J3YXkiLCAiRmlubGFuZCIsICJCdWxnYXJpYSIsICJTZXJiaWEiLCAiQ3JvYXRpYSIsICJTbG92YWtpYSIsICJTbG92ZW5pYSIsICJMaXRodWFuaWEiLCAiTGF0dmlhIiwgIkVzdG9uaWEiLCAiQmVsYXJ1cyIsICJJY2VsYW5kIiwgIkFsYmFuaWEiLCAiTm9ydGggTWFjZWRvbmlhIiwgIlJlcHVibGljIG9mIE1vbGRvdmEiLCAiS29zb3ZvIiwgIk1vbmFjbyIsICJTYW4gTWFyaW5vIiwgIkF1c3RyaWEiLCAiQW5kb3JyYSIsICJMaWVjaHRlbnN0ZWluIiwgIk1hbHRhIiwgIklyZWxhbmQiLCAiQ3lwcnVzIikgCiAgICB+ICJFdXJvcGUiLAogICAgY291bnRyeSAlaW4lIGMoIkNoaW5hIiwgIkhvbmcgS29uZywgQ2hpbmEiLCAiQ2hpbmVzZSBUYWlwZWkiLCAiSW5kaWEiLCAiSmFwYW4iLCAiS29yZWEiLCAiRFBSIEtvcmVhIiwgIkluZG9uZXNpYSIsICJQYWtpc3RhbiIsICJCYW5nbGFkZXNoIiwgIlZpZXRuYW0iLCAiUGhpbGlwcGluZXMiLCAiVGhhaWxhbmQiLCAiTWFsYXlzaWEiLCAiU2luZ2Fwb3JlIiwgIk15YW5tYXIiLCAiQ2FtYm9kaWEiLCAiU3JpIExhbmthIiwgIk5lcGFsIiwgIkFmZ2hhbmlzdGFuIiwgIklSIElyYW4iLCAiSXJhcSIsICJJc3JhZWwiLCAiSm9yZGFuIiwgIkt1d2FpdCIsICJMZWJhbm9uIiwgIk9tYW4iLCAiUWF0YXIiLCAiQmFocmFpbiIsICJVbml0ZWQgQXJhYiBFbWlyYXRlcyIsICJZZW1lbiIsICJLYXpha2hzdGFuIiwgIlV6YmVraXN0YW4iLCAiVHVya21lbmlzdGFuIiwgIlTDvHJraXllIiwgIkt5cmd5enN0YW4iLCAiVGFqaWtpc3RhbiIsICJBcm1lbmlhIiwgIkdlb3JnaWEiLCAiQXplcmJhaWphbiIsICJNb25nb2xpYSIpIAogICAgfiAiQXNpYSIsCiAgICBjb3VudHJ5ICVpbiUgYygiU291dGggQWZyaWNhIiwgIkVneXB0IiwgIktlbnlhIiwgIk5pZ2VyaWEiLCAiRXRoaW9waWEiLCAiVGFuemFuaWEiLCAiVWdhbmRhIiwgIkdoYW5hIiwgIk1vcm9jY28iLCAiQWxnZXJpYSIsICJJdm9yeSBDb2FzdCIsICJTZW5lZ2FsIiwgIkNhbWVyb29uIiwgIlppbWJhYndlIiwgIkFuZ29sYSIsICJNYWxpIiwgIlphbWJpYSIsICJSd2FuZGEiLCAiQm90c3dhbmEiLCAiVHVuaXNpYSIsICJTaWVycmEgTGVvbmUiLCAiQ8O0dGUgZCdJdm9pcmUiLCAiTGliZXJpYSIsICJNYXVyaXRpdXMiLCAiTmFtaWJpYSIsICJMZXNvdGhvIiwgIlN3YXppbGFuZCIsICJDYXBlIFZlcmRlIiwgIkdhYm9uIiwgIlNleWNoZWxsZXMiLCAiQ29tb3JvcyIsICJFcXVhdG9yaWFsIEd1aW5lYSIsICJCdXJraW5hIEZhc28iLCAiTWFkYWdhc2NhciIsICJNb3phbWJpcXVlIiwgIkNlbnRyYWwgQWZyaWNhbiBSZXB1YmxpYyIsICJSZXB1YmxpYyBvZiB0aGUgQ29uZ28iLCAiRGVtb2NyYXRpYyBSZXB1YmxpYyBvZiB0aGUgQ29uZ28iLCAiQ2hhZCIsICJFcml0cmVhIiwgIlNvbWFsaWEiLCAiU3VkYW4iLCAiU291dGggU3VkYW4iLCAiTWF1cml0YW5pYSIsICJCZW5pbiIsICJUb2dvIiwgIkJ1cnVuZGkiLCAiR3VpbmVhIiwgIkNvbmdvIiwgIk1hbGF3aSIsICJHYW1iaWEiLCAiR3VpbmVhLUJpc3NhdSIsICJTYW8gVG9tZSBhbmQgUHJpbmNpcGUiLCAiQ2FibyBWZXJkZSIpIAogICAgfiAiQWZyaWNhIiwKICAgIGNvdW50cnkgJWluJSBjKCJBdXN0cmFsaWEiLCAiTmV3IFplYWxhbmQiLCAiUGFwdWEgTmV3IEd1aW5lYSIsICJGaWppIiwgIlNhbW9hIiwgIlRvbmdhIiwgIlZhbnVhdHUiLCAiU29sb21vbiBJc2xhbmRzIiwgIktpcmliYXRpIiwgIlR1dmFsdSIsICJNYXJzaGFsbCBJc2xhbmRzIiwgIlBhbGF1IiwgIk5hdXJ1IikgCiAgICB+ICJPY2VhbmlhIiwKICAgIGNvdW50cnkgJWluJSBjKCJFT1IiKSB+ICJSZWZ1Z2VlIgogICkpCmBgYAoKIyMgZnVuY3Rpb24gdG8gZmluZCB3aGF0IHZhbHVlcyB3ZSBtaXNzZWQgaW4gdGhlIGNvbnRpbmVudCBjb2x1bW4KYGBge3J9CmdldF9uYV9jb3VudHJpZXMgPC0gZnVuY3Rpb24obWVkYWxpc3RzMSkgewogICMgRmlsdGVyIHRoZSByb3dzIHdoZXJlICdjb250aW5lbnQnIGNvbHVtbiBoYXMgTi9BIHZhbHVlcwogIG5hX2NvdW50cmllcyA8LSBtZWRhbGlzdHMxW2lzLm5hKG1lZGFsaXN0czEkY29udGluZW50KSwgImNvdW50cnkiXQogIAogICMgUmV0dXJuIHRoZSBjb3VudHJpZXMgd2l0aCBOL0EgdmFsdWVzIGluIHRoZSBjb250aW5lbnQgY29sdW1uCiAgcmV0dXJuKG5hX2NvdW50cmllcykKfQoKIyBDYWxsIHRoZSBmdW5jdGlvbiB3aXRoIHlvdXIgZGF0YSAobWVkYWxpc3RzMSkKbmFfY291bnRyaWVzIDwtIGdldF9uYV9jb3VudHJpZXMobWVkYWxpc3RzMSkKCiMgUHJpbnQgdGhlIHJlc3VsdApwcmludChuYV9jb3VudHJpZXMpCmBgYAoKCmBgYHtyfQp3cml0ZS5jc3YobWVkYWxpc3RzMSwgIm1lZGFsaXN0c19maW5hbC5jc3YiKQpgYGAKCmBgYHtyfQpzdW1tYXJ5KG1lZGFsaXN0czEkY291bnRyeSkKYGBgCgpgYGB7cn0Kd2hpY2gobWVkYWxpc3RzMSRjb3VudHJ5ID09ICJQdWVydG8gUmljbyIpCmBgYAoK