#named the chunk with its purpose but these names don't show up in html file
library(dplyr) #use library() during each session to load the packages you plan to use #I've already installed these packages
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'tibble' was built under R version 4.3.3
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.2 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.1.0 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(summarytools)
## Warning: package 'summarytools' was built under R version 4.3.3
##
## Attaching package: 'summarytools'
##
## The following object is masked from 'package:tibble':
##
## view
Text here will show up in the html file. #if you use a hash it creates a heading in the rmd file (this shows up as plain text in the html file)
#these data sets were downloaded from https://www.kaggle.com/datasets/muhammadehsan02/formula-1-world-championship-history-1950-2024/data
team_details<-read_csv("F1 World Champ History/Team_Details.csv")
## Rows: 212 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): constructorRef, name, nationality, url
## dbl (1): constructorId
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
constructor_rank<-read_csv("F1 World Champ History/Constructor_Rankings.csv")
## Rows: 13271 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): positionText
## dbl (6): constructorStandingsId, raceId, constructorId, points, position, wins
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pit_stops<-read_csv("F1 World Champ History/Pit_Stop_Records.csv")
## Rows: 10990 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): duration
## dbl (5): raceId, driverId, stop, lap, milliseconds
## time (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
driver_details<-read_csv("F1 World Champ History/Driver_Details.csv")
## Rows: 859 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): driverRef, number, code, forename, surname, nationality, url
## dbl (1): driverId
## date (1): dob
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
racesched<-read_csv("F1 World Champ History/Race_Schedule.csv")
## Rows: 1125 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): name, time, url, fp1_date, fp1_time, fp2_date, fp2_time, fp3_date...
## dbl (4): raceId, year, round, circuitId
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#anything to the left of the arrow (<-) tells R what to name an object so here we are telling R what to name each dataframe
#using summarytools package
dfSummary(constructor_rank) #maybe change this to constructor_rank
cr<-full_join(team_details, constructor_rank, by= "constructorId")
#there are several types of joins
#here we use full_join which keeps all observations in both dataframes
#by= tells R which variable it should use to match observations
library(mice) #this is a package for missing data tools
## Warning: package 'mice' was built under R version 4.3.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
md.pattern(cr, rotate.names = TRUE) #md.pattern shows us patterns of missingness #before the comma is the name of the dataframe
## constructorId constructorRef name nationality url constructorStandingsId
## 13271 1 1 1 1 1 1
## 52 1 1 1 1 1 0
## 0 0 0 0 0 52
## raceId points position positionText wins
## 13271 1 1 1 1 1 0
## 52 0 0 0 0 0 6
## 52 52 52 52 52 312
Each column is a varible and each row is a pattern of missing data- here we only have two patterns. Blue means data are present, so we have 13,271 observations with missing data on zero variables (see the 0 on the upper right). Red means data are missing, the next row shows that we have 52 observations with missing data on 6 variables.The zeroes and 52s along the bottom show us how many observations are missing data on each variable. 312 tells us how many cells are missing data in our dataframe.
#creating a dataframe with rows that are missing data after the merge, to do that we can use is.na()
#using subset from base R
cr_miss<-subset(cr, is.na(constructorStandingsId))
#using filter from dplyr
cr_miss1<-dplyr::filter(cr, is.na(constructorStandingsId))
#typing the name of the package and :: tells R to use a function from a particular package, helpful if you have multiple packages loaded that have the same function name
head(cr_miss) #shows the first 6 rows of the dataframe we just created and the first few variables
#dropping rows missing constructor standings data
#we can also use the exclamation mar to say we want rows that are NOT missing data for constructorStandingId
cr_clean<-subset(cr, !is.na(constructorStandingsId))
#drop url column
#for both methods we can us -c() which tells R the column name to remove
##using select from dplyr
racesched1<-select(racesched, -c(url))
##using subset
racesched2<-subset(racesched, select=-c(url))
#we can also select columns to keep rather than dropping a column
racesched3<-racesched[c(1:7, 9:18)]
#we don't need both dataframes so using rm() and list=c() to tell R to remove both dataframes
rm(list=c("racesched2", "racesched3"))
#now using left_join which keeps all observations in x (the first dataframe) and drops any observations in y (the second dataframe) that do not match any observations in x
#this join is also a bit more complicated in that we are using relationship=
#the type of relationship specified here is "many-to-one" which means that each row in x matches at most 1 row in y
cr_clean<-left_join(cr_clean, racesched1, by="raceId", relationship= "many-to-one")
#lets look at the column names of the dataframe we just created
colnames(cr_clean)
## [1] "constructorId" "constructorRef" "name.x"
## [4] "nationality" "url" "constructorStandingsId"
## [7] "raceId" "points" "position"
## [10] "positionText" "wins" "year"
## [13] "round" "circuitId" "name.y"
## [16] "date" "time" "fp1_date"
## [19] "fp1_time" "fp2_date" "fp2_time"
## [22] "fp3_date" "fp3_time" "quali_date"
## [25] "quali_time" "sprint_date" "sprint_time"
#now we have two variables named name, let's look at their values and see if we can rename them
head(cr_clean[ , c("name.x","name.y")])
cr_clean$const.name<-cr_clean$name.x #keep the original column with its original name
cr_clean1<-rename(cr_clean, c(constructor.name=name.x,
GP.name=name.y)) #use rename from dplyr to rename the columns, unlike the other option, this doesn't duplicate the columns
hist(pit_stops$stop, breaks = c(1, 2, 3, 4, 5, 6, 7, 70))
#create a dataframe that is just unique combinations of raceId, Gp.name, and date
##we are taking the cr_clean1 using the pipe operator to select variable names and then keep only the rows that have unique values for the selected variables and save the result as a new dataframe
cr_clean1a<-cr_clean1%>%
select(raceId, GP.name, date)%>%
distinct()
#merge the newly created df with the pit stops df
##here we are using an inner_join which only keeps observations from x that have a matching observation in y
##also using a different relationship, "one-to-many" meaning that each observation in x matches more than one observation in y
pit_stops1<-inner_join(cr_clean1a, pit_stops, by = "raceId", relationship = "one-to-many")
#merge in driver names, so let's join based on driverId
pit_stops2<-driver_details%>%
select(driverId, driverRef)%>%
right_join(pit_stops1, by = "driverId")
#creating dataframe with just four drivers
#ham is 1, bottas is 822, verstappen is 830, perez is 815
df<-pit_stops2%>%
filter(driverId %in% c(1, 822, 815, 830))
Now we have a dataframe that has every pit stop for each of the four drivers. Next, let’s create a dataframe for just one year’s worth of pit stops.
#let's look at the format of the date variable
head(df$date)
## [1] "2011-04-10" "2011-04-10" "2011-04-10" "2011-04-10" "2011-03-27"
## [6] "2011-03-27"
We could subset based on the existing date variable, but let’s imagine we want to create a new variable called year.
library(lubridate) #package for handling date variables
df$year<-year(df$date) #create new variable that takes just the year from the existing date variable
Now we have a variable with the full date and a variable with just the year.
#subset to just the 2021 season
df1<- df%>%
filter(year==2021) #using filter
Great! Now we have a dataframe for all of the pit stops for Hamilton, Bottas, Perez, and Verstappen during the 2021 season. However, we don’t have the team variable in this dataset, and next we want to look at the descriptive statistics for the pit stops broken down by team. Let’s create a team variable.
df2<-df1%>%
mutate(constructor = case_when( #this is saying create a new variable called constructor when the following is true
driverId %in% c(1, 822) ~ "Mercedes", #if the driverId is 1 or 822, constructor = Mercedes
driverId %in% c(815, 830) ~ "Red Bull", #if driverId is 815 or 830, constructor = Red Bull
TRUE ~ NA)) #if none of the conditions above are true, constructor = NA
freq(df2$constructor)#checking the frequencies
## Frequencies
## df2$constructor
## Type: Character
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## -------------- ------ --------- -------------- --------- --------------
## Mercedes 86 50.00 50.00 50.00 50.00
## Red Bull 86 50.00 100.00 50.00 100.00
## <NA> 0 0.00 100.00
## Total 172 100.00 100.00 100.00 100.00
Next, let’s look at pit stop duration by team
library(psych) #using psych just to show another descriptive stats function
## Warning: package 'psych' was built under R version 4.3.3
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
#using df2, describe the continuous variables separately for each team
describe(milliseconds +stop+ lap ~constructor, data=df2)
##
## Descriptive statistics by group
## constructor: Mercedes
## vars n mean sd median trimmed mad min max
## milliseconds 1 86 199073.74 507219.42 23441 53169.24 3302.49 15432 2049149
## stop 2 86 1.83 0.94 2 1.70 1.48 1 4
## lap 3 86 28.94 18.18 28 27.89 20.02 1 70
## range skew kurtosis se
## milliseconds 2033717 2.75 6.26 54694.88
## stop 3 0.86 -0.31 0.10
## lap 69 0.42 -0.55 1.96
## ------------------------------------------------------------
## constructor: Red Bull
## vars n mean sd median trimmed mad min max
## milliseconds 1 86 159193.14 441961.55 23521.0 24665.24 3306.94 15277 2076405
## stop 2 86 1.85 1.01 2.0 1.69 1.48 1 5
## lap 3 86 28.56 17.01 28.5 28.10 17.05 1 69
## range skew kurtosis se
## milliseconds 2061128 3.13 8.65 47657.94
## stop 4 1.18 0.82 0.11
## lap 68 0.18 -0.80 1.83
Let’s imagine that we are satisfied with our data cleaning and manipulation and now we want to create a scatterplot of laps and milliseconds
library(ggplot2)
ggplot(df2, aes(x = lap, y = milliseconds)) +
geom_point() + # Adds the scatter plot points
labs(
title = "Lap Time vs. Lap Number", # Title of the plot
x = "Lap Number", # Label for the x-axis
y = "Lap Time (milliseconds)" # Label for the y-axis
) +
scale_y_continuous(
) +
theme_minimal() # A clean, minimalist theme for the plot
We’ve got some outliers, let’s look at the more typical pit stops
ggplot(df2, aes(x = lap, y = milliseconds)) +
geom_point() + # Adds the scatter plot points
labs(
title = "Lap Time vs. Lap Number", # Title of the plot
x = "Lap Number", # Label for the x-axis
y = "Lap Time (milliseconds)" # Label for the y-axis
) +
scale_y_continuous(
limits = c(0, 100000), # Example: sets y-axis from 0 to 100,000
) +
theme_minimal() # A clean, minimalist theme for the plot
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).
This looks pretty flat, but let’s try estimating a correlation.
library(correlation) #using correlation packages
## Warning: package 'correlation' was built under R version 4.3.3
correlation(df2, select = c("lap", "milliseconds"))