## Installing package into '/home/rstudio-user/R/x86_64-pc-linux-gnu-library/3.6'
## (as 'lib' is unspecified)
## Installing package into '/home/rstudio-user/R/x86_64-pc-linux-gnu-library/3.6'
## (as 'lib' is unspecified)
## Installing package into '/home/rstudio-user/R/x86_64-pc-linux-gnu-library/3.6'
## (as 'lib' is unspecified)
## Installing package into '/home/rstudio-user/R/x86_64-pc-linux-gnu-library/3.6'
## (as 'lib' is unspecified)
## Installing package into '/home/rstudio-user/R/x86_64-pc-linux-gnu-library/3.6'
## (as 'lib' is unspecified)
## Installing package into '/home/rstudio-user/R/x86_64-pc-linux-gnu-library/3.6'
## (as 'lib' is unspecified)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
In this assignment you will be getting a soft introduction to RMarkdown and practice data cleaning. You will start by practicing the functions that you learned in DataCamp. After showing mastery of these functions, you will then moved to a real data set and clean the data for Powerlifting World Championships.
Thus far, you have taken the introduction to R, intermediate R, and importing data lessons in DataCamp. It will be assumed in this lesson that you are comfortable with the material in those lessons. Throughout this assignment there will be conceptual lessons in R to complement what you are learning in R and to provide better understanding about what you are doing in R.
R Markdown is an industry standard way to quickly create data analysis reports in R. Within R Markdown you can display data and graphs, build models and quickly replicate processes for data analysis.
In R Markdown you can build headers with different heights # Header1 ## Header2 ### Header3 #### Header4 ##### Header5 ###### Header6
You can bold and italize
You can add block quotes (Make sure there is an empty line before quote):
Enter to Learn, Go Forth to serve
df <- read.csv("ugly_data.csv")
head(df)
## This.is.Ugly.Data.this.is.no.fun X X.1 X.2 X.3
## 1
## 2 Row Name Sex ID Time survey taken
## 3 1 John Smith M 182 4/1/2019 16:20
## 4 2 Jane Doe F 183 4/2/2019 8:20
## 5 3 Peter Parker M 184 6/3/2019 4:20
## 6 4 Jeff M 185 4/4/2019 4:40
## X.4 X.5 X.6 X.7
## 1
## 2 Finished Rate this Survey Text_Aint_Right Find me
## 3 TRUE 3 Can you fix this No
## 4 TRUE 6 what about this No
## 5 FALSE 3 let's try this No
## 6 TRUE 7 now look No
## X.8
## 1
## 2 Uppercase First letter
## 3 yes
## 4 yes
## 5 yes
## 6 yes
Now that is some ugly data! looking at the head of the table we see the the Column names are in the data and the first column should be the index. We need to fix this before we can start cleaning the data. Run the code chunk below so that we know what we can do with the read.csv function. Putting a ? before any function will bring up the documentation so that you know how to use it
df <- read.csv("ugly_data.csv",skip = 2)
df <- df[-c(1)]
head(df)
## Name Sex ID Time.survey.taken Finished Rate.this.Survey
## 1 John Smith M 182 4/1/2019 16:20 TRUE 3
## 2 Jane Doe F 183 4/2/2019 8:20 TRUE 6
## 3 Peter Parker M 184 6/3/2019 4:20 FALSE 3
## 4 Jeff M 185 4/4/2019 4:40 TRUE 7
## 5 ???? Samuels F 186 4/5/2019 4:20 TRUE 9
## 6 George Washington M 187 8/6/2019 0:20 FALSE 3
## Text_Aint_Right Find.me Uppercase.First.letter
## 1 Can you fix this No yes
## 2 what about this No yes
## 3 let's try this No yes
## 4 now look No yes
## 5 you are great Yes yes
## 6 keep going No yes
glimpse(df)
## Observations: 20
## Variables: 9
## $ Name <fct> John Smith, Jane Doe, Peter Parker, Jeff,…
## $ Sex <fct> M, F, M, M, F, M, M, F, M, M, F, F, M, M,…
## $ ID <int> 182, 183, 184, 185, 186, 187, 188, 189, 1…
## $ Time.survey.taken <fct> 4/1/2019 16:20, 4/2/2019 8:20, 6/3/2019 4…
## $ Finished <lgl> TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRU…
## $ Rate.this.Survey <int> 3, 6, 3, 7, 9, 3, 1, 5, 3, 7, 4, 3, NA, 6…
## $ Text_Aint_Right <fct> " Can you fix this", "what about t…
## $ Find.me <fct> No, No, No, No, Yes, No, No, Yes, No, Yes…
## $ Uppercase.First.letter <fct> yes, yes, yes, yes, yes, yes, yes, yes, y…
Use the as.
df$Name <- as.character(df$Name)
df$Text_Aint_Right <- as.character(df$Text_Aint_Right)
df$Uppercase.First.letter <- as.character(df$Uppercase.First.letter)
df$Find.me <- as.character(df$Find.me )
glimpse(df)
## Observations: 20
## Variables: 9
## $ Name <chr> "John Smith", "Jane Doe", "Peter Parker",…
## $ Sex <fct> M, F, M, M, F, M, M, F, M, M, F, F, M, M,…
## $ ID <int> 182, 183, 184, 185, 186, 187, 188, 189, 1…
## $ Time.survey.taken <fct> 4/1/2019 16:20, 4/2/2019 8:20, 6/3/2019 4…
## $ Finished <lgl> TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRU…
## $ Rate.this.Survey <int> 3, 6, 3, 7, 9, 3, 1, 5, 3, 7, 4, 3, NA, 6…
## $ Text_Aint_Right <chr> " Can you fix this", "what about t…
## $ Find.me <chr> "No", "No", "No", "No", "Yes", "No", "No"…
## $ Uppercase.First.letter <chr> "yes", "yes", "yes", "yes", "yes", "yes",…
df_name <- separate(df, col=Name, into= c("First Name", "Last Name"))
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [4].
head(df_name)
## First Name Last Name Sex ID Time.survey.taken Finished
## 1 John Smith M 182 4/1/2019 16:20 TRUE
## 2 Jane Doe F 183 4/2/2019 8:20 TRUE
## 3 Peter Parker M 184 6/3/2019 4:20 FALSE
## 4 Jeff <NA> M 185 4/4/2019 4:40 TRUE
## 5 Samuels F 186 4/5/2019 4:20 TRUE
## 6 George Washington M 187 8/6/2019 0:20 FALSE
## Rate.this.Survey Text_Aint_Right Find.me
## 1 3 Can you fix this No
## 2 6 what about this No
## 3 3 let's try this No
## 4 7 now look No
## 5 9 you are great Yes
## 6 3 keep going No
## Uppercase.First.letter
## 1 yes
## 2 yes
## 3 yes
## 4 yes
## 5 yes
## 6 yes
df_name$Time.survey <- as.Date(df_name$Time.survey)
glimpse(df_name)
## Observations: 20
## Variables: 11
## $ `First Name` <chr> "John", "Jane", "Peter", "Jeff", "", "Geo…
## $ `Last Name` <chr> "Smith", "Doe", "Parker", NA, "Samuels", …
## $ Sex <fct> M, F, M, M, F, M, M, F, M, M, F, F, M, M,…
## $ ID <int> 182, 183, 184, 185, 186, 187, 188, 189, 1…
## $ Time.survey.taken <fct> 4/1/2019 16:20, 4/2/2019 8:20, 6/3/2019 4…
## $ Finished <lgl> TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRU…
## $ Rate.this.Survey <int> 3, 6, 3, 7, 9, 3, 1, 5, 3, 7, 4, 3, NA, 6…
## $ Text_Aint_Right <chr> " Can you fix this", "what about t…
## $ Find.me <chr> "No", "No", "No", "No", "Yes", "No", "No"…
## $ Uppercase.First.letter <chr> "yes", "yes", "yes", "yes", "yes", "yes",…
## $ Time.survey <date> 4-01-20, 4-02-20, 6-03-20, 4-04-20, 4-05…
df_name$Text_Aint_Right <- str_trim(df_name$Text_Aint_Right)
head(df_name)
## First Name Last Name Sex ID Time.survey.taken Finished
## 1 John Smith M 182 4/1/2019 16:20 TRUE
## 2 Jane Doe F 183 4/2/2019 8:20 TRUE
## 3 Peter Parker M 184 6/3/2019 4:20 FALSE
## 4 Jeff <NA> M 185 4/4/2019 4:40 TRUE
## 5 Samuels F 186 4/5/2019 4:20 TRUE
## 6 George Washington M 187 8/6/2019 0:20 FALSE
## Rate.this.Survey Text_Aint_Right Find.me Uppercase.First.letter
## 1 3 Can you fix this No yes
## 2 6 what about this No yes
## 3 3 let's try this No yes
## 4 7 now look No yes
## 5 9 you are great Yes yes
## 6 3 keep going No yes
## Time.survey
## 1 4-01-20
## 2 4-02-20
## 3 6-03-20
## 4 4-04-20
## 5 4-05-20
## 6 8-06-20
df_name$ID <- str_pad(c(df_name$ID), width= 9, side = "left", pad = 0)
head(df_name)
## First Name Last Name Sex ID Time.survey.taken Finished
## 1 John Smith M 000000182 4/1/2019 16:20 TRUE
## 2 Jane Doe F 000000183 4/2/2019 8:20 TRUE
## 3 Peter Parker M 000000184 6/3/2019 4:20 FALSE
## 4 Jeff <NA> M 000000185 4/4/2019 4:40 TRUE
## 5 Samuels F 000000186 4/5/2019 4:20 TRUE
## 6 George Washington M 000000187 8/6/2019 0:20 FALSE
## Rate.this.Survey Text_Aint_Right Find.me Uppercase.First.letter
## 1 3 Can you fix this No yes
## 2 6 what about this No yes
## 3 3 let's try this No yes
## 4 7 now look No yes
## 5 9 you are great Yes yes
## 6 3 keep going No yes
## Time.survey
## 1 4-01-20
## 2 4-02-20
## 3 6-03-20
## 4 4-04-20
## 5 4-05-20
## 6 8-06-20
str_detect(c (df_name$Find.me),"Yes")
## [1] FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE
## [12] TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE TRUE
df_name$Sex <- str_replace(c(df_name$Sex), "1", "Female")
df_name$Sex <- str_replace(c(df_name$Sex), "2", "Male")
head(df_name$Sex)
## [1] "Male" "Female" "Male" "Male" "Female" "Male"
head(df_name)
## First Name Last Name Sex ID Time.survey.taken Finished
## 1 John Smith Male 000000182 4/1/2019 16:20 TRUE
## 2 Jane Doe Female 000000183 4/2/2019 8:20 TRUE
## 3 Peter Parker Male 000000184 6/3/2019 4:20 FALSE
## 4 Jeff <NA> Male 000000185 4/4/2019 4:40 TRUE
## 5 Samuels Female 000000186 4/5/2019 4:20 TRUE
## 6 George Washington Male 000000187 8/6/2019 0:20 FALSE
## Rate.this.Survey Text_Aint_Right Find.me Uppercase.First.letter
## 1 3 Can you fix this No yes
## 2 6 what about this No yes
## 3 3 let's try this No yes
## 4 7 now look No yes
## 5 9 you are great Yes yes
## 6 3 keep going No yes
## Time.survey
## 1 4-01-20
## 2 4-02-20
## 3 6-03-20
## 4 4-04-20
## 5 4-05-20
## 6 8-06-20
df_name$Uppercase.First.letter <- toupper(df_name$Uppercase.First.letter)
df_name$Finished<- tolower(df_name$Finished)
head(df_name)
## First Name Last Name Sex ID Time.survey.taken Finished
## 1 John Smith Male 000000182 4/1/2019 16:20 true
## 2 Jane Doe Female 000000183 4/2/2019 8:20 true
## 3 Peter Parker Male 000000184 6/3/2019 4:20 false
## 4 Jeff <NA> Male 000000185 4/4/2019 4:40 true
## 5 Samuels Female 000000186 4/5/2019 4:20 true
## 6 George Washington Male 000000187 8/6/2019 0:20 false
## Rate.this.Survey Text_Aint_Right Find.me Uppercase.First.letter
## 1 3 Can you fix this No YES
## 2 6 what about this No YES
## 3 3 let's try this No YES
## 4 7 now look No YES
## 5 9 you are great Yes YES
## 6 3 keep going No YES
## Time.survey
## 1 4-01-20
## 2 4-02-20
## 3 6-03-20
## 4 4-04-20
## 5 4-05-20
## 6 8-06-20
summary(df_name)
## First Name Last Name Sex
## Length:20 Length:20 Length:20
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## ID Time.survey.taken Finished
## Length:20 1/2/2019 4:20 : 1 Length:20
## Class :character 12/9/2019 18:20: 1 Class :character
## Mode :character 2/14/2019 10:20: 1 Mode :character
## 4/1/2019 16:20 : 1
## 4/10/2019 4:20 : 1
## 4/11/2019 9:20 : 1
## (Other) :14
## Rate.this.Survey Text_Aint_Right Find.me
## Min. :1.000 Length:20 Length:20
## 1st Qu.:3.000 Class :character Class :character
## Median :4.000 Mode :character Mode :character
## Mean :4.737
## 3rd Qu.:6.500
## Max. :9.000
## NA's :1
## Uppercase.First.letter Time.survey
## Length:20 Min. :1-02-20
## Class :character 1st Qu.:4-04-05
## Mode :character Median :4-08-04
## Mean :5-05-10
## 3rd Qu.:5-03-21
## Max. :12-09-20
## NA's :8
is.na(c(df_name))
## First Name Last Name Sex
## FALSE FALSE FALSE
## ID Time.survey.taken Finished
## FALSE FALSE FALSE
## Rate.this.Survey Text_Aint_Right Find.me
## FALSE FALSE FALSE
## Uppercase.First.letter Time.survey
## FALSE FALSE
sum(is.na(df_name))
## [1] 10
df_omit <- na.omit(df_name)
glimpse(df_omit)
## Observations: 11
## Variables: 11
## $ `First Name` <chr> "John", "Jane", "Peter", "", "George", "B…
## $ `Last Name` <chr> "Smith", "Doe", "Parker", "Samuels", "Was…
## $ Sex <chr> "Male", "Female", "Male", "Female", "Male…
## $ ID <chr> "000000182", "000000183", "000000184", "0…
## $ Time.survey.taken <fct> 4/1/2019 16:20, 4/2/2019 8:20, 6/3/2019 4…
## $ Finished <chr> "true", "true", "false", "true", "false",…
## $ Rate.this.Survey <int> 3, 6, 3, 9, 3, 1, 5, 3, 7, 4, 3
## $ Text_Aint_Right <chr> "Can you fix this", "what about this", "l…
## $ Find.me <chr> "No", "No", "No", "Yes", "No", "No", "Yes…
## $ Uppercase.First.letter <chr> "YES", "YES", "YES", "YES", "YES", "YES",…
## $ Time.survey <date> 4-01-20, 4-02-20, 6-03-20, 4-05-20, 8-06…
cleaned_df <- df_omit[-c(4),]
head(cleaned_df)
## First Name Last Name Sex ID Time.survey.taken Finished
## 1 John Smith Male 000000182 4/1/2019 16:20 true
## 2 Jane Doe Female 000000183 4/2/2019 8:20 true
## 3 Peter Parker Male 000000184 6/3/2019 4:20 false
## 6 George Washington Male 000000187 8/6/2019 0:20 false
## 7 Bruce Willis Male 000000188 4/7/2019 4:20 true
## 8 Kristie Smith Female 000000189 4/8/2019 0:20 true
## Rate.this.Survey Text_Aint_Right Find.me Uppercase.First.letter
## 1 3 Can you fix this No YES
## 2 6 what about this No YES
## 3 3 let's try this No YES
## 6 3 keep going No YES
## 7 1 boring fake data No YES
## 8 5 great work, you did it Yes YES
## Time.survey
## 1 4-01-20
## 2 4-02-20
## 3 6-03-20
## 6 8-06-20
## 7 4-07-20
## 8 4-08-20
wc <- read.csv("world_championship.csv")
head(wc)
## Name Sex Event Equipment Age AgeClass Division
## 1 Adolfo Moreno M SBD Single-ply NA Open
## 2 Adolfo Roman M B Single-ply 29 24-34 Open
## 3 Adrian Parra M SBD Single-ply NA Teen U19
## 4 Adriana Bialostocki F SBD Single-ply 25 24-34 Open
## 5 Adriana Bialostocki F B Single-ply 25 24-34 Open
## 6 Alan Aerts M SBD Single-ply 51 50-54 M50+
## BodyweightKg WeightClassKg Squat1Kg Squat2Kg Squat3Kg Squat4Kg
## 1 74.05 75 190 200 210 NA
## 2 89.25 90 NA NA NA NA
## 3 78.55 82.5 -210 215 230 NA
## 4 67.45 67.5 -100 -100 -100 NA
## 5 75.00 75 NA NA NA NA
## 6 125.55 140 -140 140 165 NA
## Best3SquatKg Bench1Kg Bench2Kg Bench3Kg Bench4Kg Best3BenchKg
## 1 210 125.0 130.0 -135.0 NA 130.0
## 2 NA 130.0 140.0 150.0 NA 150.0
## 3 230 110.0 -120.0 -120.0 NA 110.0
## 4 NA NA NA NA NA NA
## 5 NA 75.0 -80.0 -80.0 NA 75.0
## 6 165 190.5 202.5 -212.5 NA 202.5
## Deadlift1Kg Deadlift2Kg Deadlift3Kg Deadlift4Kg Best3DeadliftKg TotalKg
## 1 220 230 245.0 NA 245.0 585
## 2 NA NA NA NA NA 150
## 3 -200 200 -205.0 NA 200.0 540
## 4 NA NA NA NA NA NA
## 5 NA NA NA NA NA 75
## 6 170 200 227.5 NA 227.5 595
## Place Wilks McCulloch Glossbrenner IPFPoints Tested Country Federation
## 1 3 420.60 420.60 406.97 531.46 Mexico WPF
## 2 1 96.17 96.17 92.69 477.99 Mexico WPF
## 3 1 373.00 373.00 360.45 477.86 Mexico WPF
## 4 DQ NA NA NA NA Germany WPF
## 5 2 71.30 71.30 62.72 460.80 Germany WPF
## 6 1 338.76 388.56 322.60 407.13 USA WPF
## Date MeetCountry MeetState
## 1 11/22/2007 Mexico
## 2 11/22/2007 Mexico
## 3 11/22/2007 Mexico
## 4 11/22/2007 Mexico
## 5 11/22/2007 Mexico
## 6 11/22/2007 Mexico
## MeetName
## 1 World Powerlifting and Bench Press Championships
## 2 World Powerlifting and Bench Press Championships
## 3 World Powerlifting and Bench Press Championships
## 4 World Powerlifting and Bench Press Championships
## 5 World Powerlifting and Bench Press Championships
## 6 World Powerlifting and Bench Press Championships
glimpse(wc)
## Observations: 68,627
## Variables: 37
## $ Name <fct> Adolfo Moreno, Adolfo Roman, Adrian Parra, Adria…
## $ Sex <fct> M, M, M, F, F, M, M, M, M, F, M, M, M, F, M, M, …
## $ Event <fct> SBD, B, SBD, SBD, B, SBD, B, B, SBD, SBD, SBD, S…
## $ Equipment <fct> Single-ply, Single-ply, Single-ply, Single-ply, …
## $ Age <dbl> NA, 29.0, NA, 25.0, 25.0, 51.0, 51.0, 43.0, NA, …
## $ AgeClass <fct> , 24-34, , 24-34, 24-34, 50-54, 50-54, 40-44, , …
## $ Division <fct> Open, Open, Teen U19, Open, Open, M50+, M50+, M4…
## $ BodyweightKg <dbl> 74.05, 89.25, 78.55, 67.45, 75.00, 125.55, 126.0…
## $ WeightClassKg <fct> 75, 90, 82.5, 67.5, 75, 140, 140, 110, 82.5, 82.…
## $ Squat1Kg <dbl> 190.0, NA, -210.0, -100.0, NA, -140.0, NA, NA, 1…
## $ Squat2Kg <dbl> 200.0, NA, 215.0, -100.0, NA, 140.0, NA, NA, 205…
## $ Squat3Kg <dbl> 210.0, NA, 230.0, -100.0, NA, 165.0, NA, NA, 210…
## $ Squat4Kg <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Best3SquatKg <dbl> 210.0, NA, 230.0, NA, NA, 165.0, NA, NA, 210.0, …
## $ Bench1Kg <dbl> 125.0, 130.0, 110.0, NA, 75.0, 190.5, 75.0, 170.…
## $ Bench2Kg <dbl> 130.0, 140.0, -120.0, NA, -80.0, 202.5, 100.0, 1…
## $ Bench3Kg <dbl> -135.0, 150.0, -120.0, NA, -80.0, -212.5, 132.5,…
## $ Bench4Kg <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Best3BenchKg <dbl> 130.0, 150.0, 110.0, NA, 75.0, 202.5, 132.5, 182…
## $ Deadlift1Kg <dbl> 220.0, NA, -200.0, NA, NA, 170.0, NA, NA, 160.0,…
## $ Deadlift2Kg <dbl> 230.0, NA, 200.0, NA, NA, 200.0, NA, NA, -180.0,…
## $ Deadlift3Kg <dbl> 245.0, NA, -205.0, NA, NA, 227.5, NA, NA, 190.0,…
## $ Deadlift4Kg <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Best3DeadliftKg <dbl> 245.0, NA, 200.0, NA, NA, 227.5, NA, NA, 190.0, …
## $ TotalKg <dbl> 585.0, 150.0, 540.0, NA, 75.0, 595.0, 132.5, 182…
## $ Place <fct> 3, 1, 1, DQ, 2, 1, 1, 1, 1, 1, 2, 1, DQ, 1, DQ, …
## $ Wilks <dbl> 420.60, 96.17, 373.00, NA, 71.30, 338.76, 75.38,…
## $ McCulloch <dbl> 420.60, 96.17, 373.00, NA, 71.30, 388.56, 86.47,…
## $ Glossbrenner <dbl> 406.97, 92.69, 360.45, NA, 62.72, 322.60, 71.78,…
## $ IPFPoints <dbl> 531.46, 477.99, 477.86, NA, 460.80, 407.13, 373.…
## $ Tested <fct> , , , , , , , , , , , , , , , , , , , , , , , ,
## $ Country <fct> Mexico, Mexico, Mexico, Germany, Germany, USA, U…
## $ Federation <fct> WPF, WPF, WPF, WPF, WPF, WPF, WPF, WPF, WPF, WPF…
## $ Date <fct> 11/22/2007, 11/22/2007, 11/22/2007, 11/22/2007, …
## $ MeetCountry <fct> Mexico, Mexico, Mexico, Mexico, Mexico, Mexico, …
## $ MeetState <fct> , , , , , , , , , , , , , , , , , , , , , , , ,
## $ MeetName <fct> World Powerlifting and Bench Press Championships…
str(wc)
## 'data.frame': 68627 obs. of 37 variables:
## $ Name : Factor w/ 37017 levels "A Aliev","A Almeida",..: 452 453 480 492 492 683 683 736 752 775 ...
## $ Sex : Factor w/ 2 levels "F","M": 2 2 2 1 1 2 2 2 2 1 ...
## $ Event : Factor w/ 7 levels "B","BD","D","S",..: 6 1 6 6 1 6 1 1 6 6 ...
## $ Equipment : Factor w/ 5 levels "Multi-ply","Raw",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Age : num NA 29 NA 25 25 51 51 43 NA 30 ...
## $ AgeClass : Factor w/ 17 levels "","12-May","13-15",..: 1 7 1 7 7 11 11 9 1 7 ...
## $ Division : Factor w/ 885 levels "","10-Sep","12-Nov",..: 659 659 817 659 659 542 542 536 544 659 ...
## $ BodyweightKg : num 74 89.2 78.5 67.5 75 ...
## $ WeightClassKg : Factor w/ 102 levels "","100","100+",..: 80 97 87 70 80 24 24 10 87 87 ...
## $ Squat1Kg : num 190 NA -210 -100 NA -140 NA NA 180 -185 ...
## $ Squat2Kg : num 200 NA 215 -100 NA 140 NA NA 205 195 ...
## $ Squat3Kg : num 210 NA 230 -100 NA 165 NA NA 210 -200 ...
## $ Squat4Kg : num NA NA NA NA NA NA NA NA NA NA ...
## $ Best3SquatKg : num 210 NA 230 NA NA 165 NA NA 210 195 ...
## $ Bench1Kg : num 125 130 110 NA 75 ...
## $ Bench2Kg : num 130 140 -120 NA -80 ...
## $ Bench3Kg : num -135 150 -120 NA -80 ...
## $ Bench4Kg : num NA NA NA NA NA NA NA NA NA NA ...
## $ Best3BenchKg : num 130 150 110 NA 75 ...
## $ Deadlift1Kg : num 220 NA -200 NA NA 170 NA NA 160 185 ...
## $ Deadlift2Kg : num 230 NA 200 NA NA 200 NA NA -180 200 ...
## $ Deadlift3Kg : num 245 NA -205 NA NA ...
## $ Deadlift4Kg : num NA NA NA NA NA NA NA NA NA NA ...
## $ Best3DeadliftKg: num 245 NA 200 NA NA ...
## $ TotalKg : num 585 150 540 NA 75 ...
## $ Place : Factor w/ 33 levels "1","10","11",..: 23 1 1 31 12 1 1 1 1 1 ...
## $ Wilks : num 420.6 96.2 373 NA 71.3 ...
## $ McCulloch : num 420.6 96.2 373 NA 71.3 ...
## $ Glossbrenner : num 407 92.7 360.4 NA 62.7 ...
## $ IPFPoints : num 531 478 478 NA 461 ...
## $ Tested : Factor w/ 2 levels "","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Country : Factor w/ 127 levels "","Algeria","Argentina",..: 68 68 68 37 37 120 120 75 68 68 ...
## $ Federation : Factor w/ 35 levels "AAU","APA","APF",..: 30 30 30 30 30 30 30 30 30 30 ...
## $ Date : Factor w/ 376 levels "1/12/2005","1/24/1998",..: 112 112 112 112 112 112 112 112 112 112 ...
## $ MeetCountry : Factor w/ 43 levels "Argentina","Australia",..: 25 25 25 25 25 25 25 25 25 25 ...
## $ MeetState : Factor w/ 41 levels "","AB","AK","AL",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ MeetName : Factor w/ 127 levels "24 Hour Fitness World Bench Press and Deadlift",..: 102 102 102 102 102 102 102 102 102 102 ...
hist(wc$Age, xlab = "Age", ylab= "Number of Weight Lifter", main = "Histogram of Age", col = 'red')
plot(wc$Best3BenchKg, wc$Best3DeadliftKg)
summary(wc)
## Name Sex Event Equipment
## Martin Drake : 52 F:16268 B :24045 Multi-ply : 9662
## Duane Burlingame: 41 M:52359 BD : 1308 Raw :21310
## Gordon Santee : 38 D : 5588 Single-ply:32914
## Hiroyuki Isagawa: 36 S : 245 Straps : 7
## Dariusz Wszoła : 34 SB : 3 Wraps : 4734
## Jan Wegiera : 34 SBD:37433
## (Other) :68392 SD : 5
## Age AgeClass Division BodyweightKg
## Min. : 0.00 24-34 :14623 Open :22019 Min. : 21.60
## 1st Qu.:23.00 :13352 Juniors : 5743 1st Qu.: 67.40
## Median :33.00 20-23 : 8464 Masters 1 : 3615 Median : 82.50
## Mean :35.63 40-44 : 6050 Masters 2 : 2911 Mean : 85.84
## 3rd Qu.:45.50 35-39 : 4980 Sub-Juniors: 2285 3rd Qu.:100.00
## Max. :95.00 45-49 : 4285 O : 2031 Max. :234.10
## NA's :14898 (Other):16873 (Other) :30023 NA's :975
## WeightClassKg Squat1Kg Squat2Kg Squat3Kg
## 90 : 7256 Min. :-551.1 Min. :-551.12 Min. :-551.12
## 82.5 : 6772 1st Qu.: 90.0 1st Qu.:-142.50 1st Qu.:-220.00
## 100 : 6568 Median : 172.5 Median : 162.50 Median : 105.00
## 75 : 6360 Mean : 117.2 Mean : 80.58 Mean : 9.84
## 110 : 5530 3rd Qu.: 240.0 3rd Qu.: 240.00 3rd Qu.: 222.50
## 67.5 : 5239 Max. : 480.0 Max. : 502.50 Max. : 490.00
## (Other):30902 NA's :47708 NA's :47984 NA's :48507
## Squat4Kg Best3SquatKg Bench1Kg Bench2Kg
## Min. :-472.50 Min. :-440.0 Min. :-480.00 Min. :-500.50
## 1st Qu.:-145.00 1st Qu.: 152.5 1st Qu.: 62.50 1st Qu.: -97.50
## Median : 140.00 Median : 210.0 Median : 120.00 Median : 107.50
## Mean : 56.54 Mean : 211.7 Mean : 87.16 Mean : 52.23
## 3rd Qu.: 215.00 3rd Qu.: 265.0 3rd Qu.: 170.00 3rd Qu.: 167.25
## Max. : 467.20 Max. : 502.5 Max. : 415.00 Max. : 390.00
## NA's :68110 NA's :32500 NA's :32299 NA's :32712
## Bench3Kg Bench4Kg Best3BenchKg Deadlift1Kg
## Min. :-545.00 Min. :-440.00 Min. :-490.0 Min. :-461.0
## 1st Qu.:-165.00 1st Qu.:-147.50 1st Qu.: 100.0 1st Qu.: 140.0
## Median : -80.00 Median : 83.20 Median : 142.5 Median : 200.0
## Mean : -26.41 Mean : 17.01 Mean : 144.4 Mean : 173.7
## 3rd Qu.: 137.50 3rd Qu.: 155.00 3rd Qu.: 185.0 3rd Qu.: 250.0
## Max. : 400.00 Max. : 350.00 Max. : 415.0 Max. : 425.0
## NA's :33624 NA's :67369 NA's :9578 NA's :45581
## Deadlift2Kg Deadlift3Kg Deadlift4Kg Best3DeadliftKg
## Min. :-435.0 Min. :-440.0 Min. :-461.0 Min. :-350.0
## 1st Qu.: 110.0 1st Qu.:-250.0 1st Qu.:-200.0 1st Qu.: 167.5
## Median : 190.0 Median :-145.0 Median : 137.5 Median : 220.0
## Mean : 119.7 Mean : -35.1 Mean : 37.9 Mean : 217.7
## 3rd Qu.: 252.5 3rd Qu.: 205.0 3rd Qu.: 210.0 3rd Qu.: 267.5
## Max. : 440.0 Max. : 420.0 Max. : 400.0 Max. : 454.1
## NA's :46101 NA's :47187 NA's :67622 NA's :26720
## TotalKg Place Wilks McCulloch
## Min. : 2.5 1 :27674 Min. : 1.47 Min. : 1.47
## 1st Qu.: 180.0 2 :10926 1st Qu.:122.13 1st Qu.:130.74
## Median : 325.0 3 : 6590 Median :295.37 Median :320.54
## Mean : 389.6 DQ : 4900 Mean :282.24 Mean :297.02
## 3rd Qu.: 590.0 4 : 4538 3rd Qu.:426.48 3rd Qu.:446.32
## Max. :1230.0 5 : 3379 Max. :742.66 Max. :742.66
## NA's :5054 (Other):10620 NA's :5944 NA's :5957
## Glossbrenner IPFPoints Tested Country
## Min. : 1.41 Min. : 9.18 :23978 USA :14901
## 1st Qu.:115.89 1st Qu.: 485.29 Yes:44649 :12922
## Median :273.98 Median : 554.86 Russia : 7674
## Mean :266.27 Mean : 556.01 Germany: 2848
## 3rd Qu.:403.39 3rd Qu.: 625.92 UK : 2593
## Max. :668.91 Max. :1239.61 Japan : 2061
## NA's :5944 NA's :7260 (Other):25628
## Federation Date MeetCountry MeetState
## IPF :26697 12/9/2003 : 1116 USA :27542 :42690
## WPC : 7836 10/17/2014: 1085 Russia : 8001 NV : 7764
## IPL : 4862 10/4/2018 : 1055 Czechia : 3071 FL : 2489
## WRPF : 3995 8/20/2014 : 915 South Africa: 2939 PA : 2244
## AAU : 3353 10/25/2017: 909 Germany : 2473 TX : 1259
## WNPF : 3346 11/9/2015 : 834 Canada : 2155 CA : 1245
## (Other):18538 (Other) :62713 (Other) :22446 (Other):10936
## MeetName
## World Championships :15312
## World Bench Press Championships : 5411
## World Powerlifting Championships : 3848
## World Masters Powerlifting Championships: 3483
## World Cup : 2803
## Men's World Powerlifting Championships : 2753
## (Other) :35017
We see here that the there are a bunch of different attempts at Squat, Deadlift, and Bench. Each of these have a min in the negative numbers. This seems like a mistake, but we’ll deal with that later.
20.For the sake of this assignment we are only concerned with the best for the Squat, Deadlift and Bench. Subset the data so that does not include the attempts at the weights, only the best of the 4 attempts.
bestof <- wc[-c(10:13, 15:18, 20:23)]
powerlift_name <- separate(bestof,1, c("First Name", "Last Name"), sep=" ")
## Warning: Expected 2 pieces. Additional pieces discarded in 2727 rows [39,
## 54, 55, 59, 78, 162, 174, 183, 241, 242, 254, 263, 273, 284, 321, 328, 359,
## 426, 433, 436, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 2161 rows
## [1850, 1851, 1852, 1853, 1855, 1858, 1859, 1860, 1861, 1862, 1863, 1864,
## 1865, 1866, 1867, 1868, 1869, 1870, 1871, 1872, ...].
levels(powerlift_name$Sex) <- c(levels(powerlift_name$Sex), "Female")
powerlift_name$Sex[powerlift_name$Sex=="F"] <- "Female"
levels(powerlift_name$Sex) <- c(levels(powerlift_name$Sex), "Male")
powerlift_name$Sex[powerlift_name$Sex=="M"] <- "Male"
str(powerlift_name)
## 'data.frame': 68627 obs. of 26 variables:
## $ First Name : chr "Adolfo" "Adolfo" "Adrian" "Adriana" ...
## $ Last Name : chr "Moreno" "Roman" "Parra" "Bialostocki" ...
## $ Sex : Factor w/ 4 levels "F","M","Female",..: 4 4 4 3 3 4 4 4 4 3 ...
## $ Event : Factor w/ 7 levels "B","BD","D","S",..: 6 1 6 6 1 6 1 1 6 6 ...
## $ Equipment : Factor w/ 5 levels "Multi-ply","Raw",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Age : num NA 29 NA 25 25 51 51 43 NA 30 ...
## $ AgeClass : Factor w/ 17 levels "","12-May","13-15",..: 1 7 1 7 7 11 11 9 1 7 ...
## $ Division : Factor w/ 885 levels "","10-Sep","12-Nov",..: 659 659 817 659 659 542 542 536 544 659 ...
## $ BodyweightKg : num 74 89.2 78.5 67.5 75 ...
## $ WeightClassKg : Factor w/ 102 levels "","100","100+",..: 80 97 87 70 80 24 24 10 87 87 ...
## $ Best3SquatKg : num 210 NA 230 NA NA 165 NA NA 210 195 ...
## $ Best3BenchKg : num 130 150 110 NA 75 ...
## $ Best3DeadliftKg: num 245 NA 200 NA NA ...
## $ TotalKg : num 585 150 540 NA 75 ...
## $ Place : Factor w/ 33 levels "1","10","11",..: 23 1 1 31 12 1 1 1 1 1 ...
## $ Wilks : num 420.6 96.2 373 NA 71.3 ...
## $ McCulloch : num 420.6 96.2 373 NA 71.3 ...
## $ Glossbrenner : num 407 92.7 360.4 NA 62.7 ...
## $ IPFPoints : num 531 478 478 NA 461 ...
## $ Tested : Factor w/ 2 levels "","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Country : Factor w/ 127 levels "","Algeria","Argentina",..: 68 68 68 37 37 120 120 75 68 68 ...
## $ Federation : Factor w/ 35 levels "AAU","APA","APF",..: 30 30 30 30 30 30 30 30 30 30 ...
## $ Date : Factor w/ 376 levels "1/12/2005","1/24/1998",..: 112 112 112 112 112 112 112 112 112 112 ...
## $ MeetCountry : Factor w/ 43 levels "Argentina","Australia",..: 25 25 25 25 25 25 25 25 25 25 ...
## $ MeetState : Factor w/ 41 levels "","AB","AK","AL",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ MeetName : Factor w/ 127 levels "24 Hour Fitness World Bench Press and Deadlift",..: 102 102 102 102 102 102 102 102 102 102 ...
completeFun <- function(data, desiredCols) {
completeVec <- complete.cases(data[, desiredCols])
return(data[completeVec, ])
}
no_NA <- completeFun(powerlift_name, c("Age", "AgeClass", "Best3SquatKg", "Best3BenchKg", "Best3DeadliftKg", "TotalKg"))
plot(no_NA$Best3BenchKg ~ no_NA$Best3DeadliftKg)
cleaned <- no_NA[c("Best3DeadliftKg", "Best3BenchKg" , "Best3SquatKg" , "TotalKg") >=0,]
Notice now that from the time that you imported the Powerlifting dataset to now, you have reduced the data from 68627 observations to 27385 observations that are usable for us.
Run the following code to create a new data frame that will count the number of weight lifters from the each country.
country <- as.data.frame(table(no_NA$Country))
head(country)
## Var1 Freq
## 1 2844
## 2 Algeria 34
## 3 Argentina 64
## 4 Armenia 14
## 5 Australia 407
## 6 Austria 311
To show you the power of R, run the next code to create an interactive bar chart of the previous data.
p <- plot_ly(
x = country$Var1,
y = country$Freq,
name = "Where do weightlifters come from?",
type = "bar"
)
p
Happy Coding.