library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
install.packages(“dplyr”)
athletes <- read.csv('athlete_events.csv')
#athletes
#Q1.1 Find the number of missing/NA values in each column
na_count <-sapply(athletes, function(y) sum(length(which(is.na(y)))))
na_count <- data.frame(na_count)
na_count
## na_count
## ID 0
## Name 0
## Sex 0
## Age 9474
## Height 60171
## Weight 62875
## Team 0
## NOC 0
## Games 0
## Year 0
## Season 0
## City 0
## Sport 0
## Event 0
## Medal 231333
#Q1.2 Remove all records with missing data
athletes.cleaned <- athletes[complete.cases(athletes), ]
#athletes.cleaned
#alt: y <- na.omit(athletes)
#Q2 In which year did the USA have the highest medal count?
athletes.cleaned.USA <- athletes.cleaned[athletes.cleaned$NOC=="USA",]
#athletes.cleaned.USA
#athletes.cleaned$Year[which.max()]
#aggregateMethod<-aggregate(Temp~Month,athletes.cleaned.USA,function(x) {c(MaxTemp=max(x),MinTemp=min(x))})
year.count<-aggregate(athletes.cleaned.USA$Medal, by=list(Year=athletes.cleaned.USA$Year), FUN=length)
#year.count
year.count[which.max(year.count$x),]
## Year x
## 21 1984 361
#Q3 How many countries have a total medal count higher than 10?
athletes.cleaned.countriesVSMedalCount <-aggregate(athletes.cleaned$Medal, by=list(Year=athletes.cleaned$NOC), FUN=length)
#athletes.cleaned.countriesVSMedalCount
NumGreaterTen<-nrow(athletes.cleaned.countriesVSMedalCount[athletes.cleaned.countriesVSMedalCount$x>10,])
NumGreaterTen
## [1] 86
#Q4
athletes.cleaned.season <-athletes.cleaned[ athletes.cleaned$Season=="Summer" | athletes.cleaned$Season=="Winter",]
#athletes.cleaned.season
athletes.cleaned.seasonAndGold<-athletes.cleaned.season[athletes.cleaned.season$Medal=="Gold",]
#athletes.cleaned.seasonAndGold
athletes.seasonAndGold.byCountry<-aggregate(athletes.cleaned.seasonAndGold$Medal, by=list(NOC=athletes.cleaned.seasonAndGold$NOC), FUN=length)
#athletes.seasonAndGold.byCountry
athletes.seasonAndGold.byCountry[which.max(athletes.seasonAndGold.byCountry$x),]
## NOC x
## 100 USA 2115
#Q5
#athletes.cleaned
#Countries vs. no. of gold medals
atheletes.cleaned.Gold <- athletes.cleaned[athletes.cleaned$Medal=="Gold",]
#atheletes.cleaned.Gold
atheletes.cleaned.countryVSGold <-aggregate(atheletes.cleaned.Gold$Medal, by=list(NOC=atheletes.cleaned.Gold$NOC), FUN=length)
#atheletes.cleaned.countryVSGold
#Countries vs. total metals
atheletes.cleaned.countryVStotalMedals <-aggregate(athletes.cleaned$Medal, by=list(NOC=athletes.cleaned$NOC), FUN=length)
#atheletes.cleaned.countryVStotalMedals
#Merge two dataframe
atheletes.cleaned.goldVSTotal <- merge(atheletes.cleaned.countryVSGold,atheletes.cleaned.countryVStotalMedals,by="NOC")
atheletes.cleaned.goldVSTotal<-mutate(atheletes.cleaned.goldVSTotal, perct=x.x/x.y*100)
#atheletes.cleaned.goldVSTotal
#atheletes.cleaned.goldVSTotal[which.max(atheletes.cleaned.goldVSTotal$perct),]
#tbl_df(atheletes.cleaned.goldVSTotal) %>% group_by(NOC) %>% top_n(5)
#sort
#atheletes.cleaned.goldVSTotal$perct
goldVSTotalSorted <- atheletes.cleaned.goldVSTotal[order(atheletes.cleaned.goldVSTotal$perct,decreasing = TRUE),]
goldVSTotalSorted[1:5,]
## NOC x.x x.y perct
## 32 FIJ 13 13 100.00000
## 52 JOR 1 1 100.00000
## 57 KOS 1 1 100.00000
## 18 CMR 16 18 88.88889
## 105 ZIM 17 22 77.27273
#Q6
#remove duplicate data based on name
athelete.names <- athletes.cleaned[!duplicated(athletes.cleaned$Name), ]
#athelete.names
athelete.BMI <- mutate(athelete.names, BMI=Weight/(Height^2))
#athelete.BMI
#get max min and avg for each country
maxBMI<- aggregate(athelete.BMI$BMI, by = list(athelete.BMI$NOC), max)
#maxBMI
minBMI<- aggregate(athelete.BMI$BMI, by = list(athelete.BMI$NOC), min)
#minBMI
meanBMI<-aggregate(athelete.BMI$BMI, by = list(athelete.BMI$NOC),mean)
#meanBMI
t1<-merge(maxBMI,minBMI,by="Group.1")
sumBMI<-merge(t1,meanBMI,by="Group.1")
#sumBMI
#rename cols
sumBMI<-sumBMI %>%
rename(
max = x.x,
min = x.y,
mean = x
)
#find top 10 avg. BMI countries
sumBMIsorted <- sumBMI[order(sumBMI$mean,decreasing = TRUE),]
sumBMIsorted[1:10,]
## Group.1 max min mean
## 123 TGA 0.004090577 0.004090577 0.004090577
## 75 KUW 0.002998359 0.002998359 0.002998359
## 77 LIB 0.003086420 0.002689232 0.002887826
## 64 ISV 0.002777778 0.002777778 0.002777778
## 40 FIJ 0.003333333 0.002514742 0.002776274
## 125 TJK 0.003431611 0.002343750 0.002766402
## 5 ARM 0.004320988 0.001988385 0.002727359
## 37 EST 0.003703704 0.002090420 0.002719885
## 47 GEO 0.004122755 0.001972318 0.002687314
## 44 GAB 0.002686981 0.002686981 0.002686981
#Q7
athelete.BMI2 <- mutate(athelete.names, BMI=Weight/(Height^2)*10000)
#athelete.BMI2
#assign labels
athelete.BMI2$BMIType <- ifelse(athelete.BMI2$BMI <18.5, "underweight", ifelse(athelete.BMI2$BMI<25,"normal",ifelse(athelete.BMI2$BMI<30,"overweight","obese")) )
#athelete.BMI2
overweight<-athelete.BMI2[athelete.BMI2$BMIType=="overweight",]
countryVSOverweight<-aggregate(overweight$BMIType, by=list(NOC=overweight$NOC), FUN=length)
#countryVSOverweight
countryVStotalAthelete<-aggregate(athelete.BMI$BMI, by=list(NOC=athelete.BMI$NOC), FUN=length)
#countryVStotalAthelete
#Merge two dataframe and calculate
overweightVStotal <- merge(countryVSOverweight,countryVStotalAthelete,by="NOC")
#overweightVStotal
overweightVStotal<-mutate(overweightVStotal, OverweightPercent=x.x/x.y*100)
#overweightVStotal
overweightVStotalSorted <- overweightVStotal[order(overweightVStotal$OverweightPercent,decreasing = TRUE),]
#not 100%
overweightVStotalSorted <-filter(overweightVStotalSorted,OverweightPercent!=100)
overweightVStotalSorted[1,]
## NOC x.x x.y OverweightPercent
## 1 FIJ 12 13 92.30769
#Q8
#athletes.cleaned
summer <- athletes.cleaned[athletes.cleaned$Season =="Summer",]
#summer
winter <- athletes.cleaned[athletes.cleaned$Season =="Winter",]
#winter
summerMedals <-aggregate(summer$Medal, by=list(NOC=summer$NOC), FUN=length)
#summerMedals
winterMedals <-aggregate(winter$Medal, by=list(NOC=winter$NOC), FUN=length)
#winterMedals
summerVSWinter <- merge(summerMedals,winterMedals,by="NOC")
#summerVSWinter
summerVSWinter<-summerVSWinter %>%
rename(
Summer = x.x,
Winter = x.y
)
summerVSWinter <- mutate(summerVSWinter, diff=Summer-Winter)
#summerVSWinter
#summary(summerVSWinter)
Answer to Q8:Yes, there are significant differences between the summer and winter olympics medal counts for countries. Most countries win more medals in the summer than in the winter. After calculating the difference between the medals won in the summer and the medals won in the winter, I found that the medals won in the summer are significantly higher than the medals won in the winter with only a few exceptions. However, I think the difference in number is too high that I don’t think I can just generalize it to be athletes perform better in summer than winter. Then I looked at the summary of this data and found that much more medals were won with a much higher mean in the summer than in the winter. The mean of medal counts overall is 400 higher in the summer than in the winter. Therefore, one possibility is that more medals are given out in the summer and maybe because there are more competitions in summer olympics.
One other fun fact is that all the countries that do better, i.e. win more medals in the winter olympics are countries in Europe and they are mostly neighbor countries such as Switzerland, Luxembourg and Austria. We can infer that people there are more used to the cold weather and thus perform better in the winter in comparison to other countries.