Importing libraries
library(readr) #importing data files
library(dplyr) #used for data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr) #used for unite function to combine cols
library(tidyverse) #using for merge function
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ purrr 1.0.1
## ✔ ggplot2 3.4.3 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych) #for describing data frame : similar to summary
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(ggplot2)
library(rAmCharts4)
Importing CSV files
raw_circuits <- read_csv("C:/Users/kesha/Desktop/DataSets/f1 data/circuits.csv")
## Rows: 77 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): circuitRef, name, location, country, alt, url
## dbl (3): circuitId, lat, lng
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
raw_constructor_results <- read_csv("C:/Users/kesha/Desktop/DataSets/f1 data/constructor_results.csv")
## Rows: 12170 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): status
## dbl (4): constructorResultsId, raceId, constructorId, points
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
raw_constructor_standings <- read_csv("C:/Users/kesha/Desktop/DataSets/f1 data/constructor_standings.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 12941 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (7): constructorStandingsId, raceId, constructorId, points, position, po...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
raw_constructors <- read_csv("C:/Users/kesha/Desktop/DataSets/f1 data/constructors.csv")
## Rows: 211 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): constructorRef, name, nationality, url
## dbl (1): constructorId
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
raw_driver_standings <- read_csv("C:/Users/kesha/Desktop/DataSets/f1 data/driver_standings.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 33902 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (7): driverStandingsId, raceId, driverId, points, position, positionText...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
raw_drivers <- read_csv("C:/Users/kesha/Desktop/DataSets/f1 data/drivers.csv")
## Rows: 857 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): driverRef, number, code, forename, surname, nationality, url
## dbl (1): driverId
## date (1): dob
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
raw_lap_times <- read_csv("C:/Users/kesha/Desktop/DataSets/f1 data/lap_times.csv")
## Rows: 538121 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): raceId, driverId, lap, position, milliseconds
## time (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
raw_pit_stops <- read_csv("C:/Users/kesha/Desktop/DataSets/f1 data/pit_stops.csv")
## Rows: 9634 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): duration
## dbl (5): raceId, driverId, stop, lap, milliseconds
## time (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
raw_qyalifying <- read_csv("C:/Users/kesha/Desktop/DataSets/f1 data/qualifying.csv")
## Rows: 9575 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): q1, q2, q3
## dbl (6): qualifyId, raceId, driverId, constructorId, number, position
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
raw_races <- read_csv("C:/Users/kesha/Desktop/DataSets/f1 data/races.csv")
## Rows: 1102 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): name, date, time, url, fp1_date, fp1_time, fp2_date, fp2_time, fp3...
## dbl (4): raceId, year, round, circuitId
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
raw_results <- read_csv("C:/Users/kesha/Desktop/DataSets/f1 data/results.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 25840 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): position, positionText, time, milliseconds, fastestLap, rank, fast...
## dbl (10): resultId, raceId, driverId, constructorId, number, grid, positionO...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
raw_seasons <- read_csv("C:/Users/kesha/Desktop/DataSets/f1 data/seasons.csv")
## Rows: 74 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): url
## dbl (1): year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
raw_sprint_results <- read_csv("C:/Users/kesha/Desktop/DataSets/f1 data/sprint_results.csv")
## Rows: 120 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): position, positionText, time, milliseconds, fastestLap, fastestLap...
## dbl (10): resultId, raceId, driverId, constructorId, number, grid, positionO...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
raw_status <- read_csv("C:/Users/kesha/Desktop/DataSets/f1 data/status.csv")
## Rows: 139 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): status
## dbl (1): statusId
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Viewing the dataset
View(raw_drivers)
Checking if dataset is loaded properly
head(raw_results,n=20)
## # A tibble: 20 × 18
## resultId raceId driverId constructorId number grid position positionText
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 1 18 1 1 22 1 "1" 1
## 2 2 18 2 2 3 5 "2" 2
## 3 3 18 3 3 7 7 "3" 3
## 4 4 18 4 4 5 11 "4" 4
## 5 5 18 5 1 23 3 "5" 5
## 6 6 18 6 3 8 13 "6" 6
## 7 7 18 7 5 14 17 "7" 7
## 8 8 18 8 6 1 15 "8" 8
## 9 9 18 9 2 4 2 "\\N" R
## 10 10 18 10 7 12 18 "\\N" R
## 11 11 18 11 8 18 19 "\\N" R
## 12 12 18 12 4 6 20 "\\N" R
## 13 13 18 13 6 2 4 "\\N" R
## 14 14 18 14 9 9 8 "\\N" R
## 15 15 18 15 7 11 6 "\\N" R
## 16 16 18 16 10 20 22 "\\N" R
## 17 17 18 17 9 10 14 "\\N" R
## 18 18 18 18 11 16 12 "\\N" R
## 19 19 18 19 8 19 21 "\\N" R
## 20 20 18 20 5 15 9 "\\N" R
## # ℹ 10 more variables: positionOrder <dbl>, points <dbl>, laps <dbl>,
## # time <chr>, milliseconds <chr>, fastestLap <chr>, rank <chr>,
## # fastestLapTime <chr>, fastestLapSpeed <chr>, statusId <dbl>
Information about “Result” dataset
results_summary <- summary(raw_results)
results_summary
## resultId raceId driverId constructorId
## Min. : 1 Min. : 1.0 Min. : 1.0 Min. : 1.00
## 1st Qu.: 6461 1st Qu.: 293.0 1st Qu.: 56.0 1st Qu.: 6.00
## Median :12920 Median : 514.0 Median :163.0 Median : 25.00
## Mean :12921 Mean : 531.4 Mean :261.7 Mean : 48.63
## 3rd Qu.:19380 3rd Qu.: 784.0 3rd Qu.:360.0 3rd Qu.: 58.00
## Max. :25845 Max. :1096.0 Max. :856.0 Max. :214.00
##
## number grid position positionText
## Min. : 0.00 Min. : 0.00 Length:25840 Length:25840
## 1st Qu.: 7.00 1st Qu.: 5.00 Class :character Class :character
## Median : 15.00 Median :11.00 Mode :character Mode :character
## Mean : 17.79 Mean :11.18
## 3rd Qu.: 24.00 3rd Qu.:17.00
## Max. :208.00 Max. :34.00
## NA's :6
## positionOrder points laps time
## Min. : 1.00 Min. : 0.000 Min. : 0.00 Length:25840
## 1st Qu.: 6.00 1st Qu.: 0.000 1st Qu.: 22.00 Class :character
## Median :12.00 Median : 0.000 Median : 52.00 Mode :character
## Mean :12.88 Mean : 1.877 Mean : 45.98
## 3rd Qu.:18.00 3rd Qu.: 2.000 3rd Qu.: 66.00
## Max. :39.00 Max. :50.000 Max. :200.00
##
## milliseconds fastestLap rank fastestLapTime
## Length:25840 Length:25840 Length:25840 Length:25840
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## fastestLapSpeed statusId
## Length:25840 Min. : 1.00
## Class :character 1st Qu.: 1.00
## Mode :character Median : 11.00
## Mean : 17.57
## 3rd Qu.: 14.00
## Max. :141.00
##
Making copies of datasets above
races <- raw_races
circuits <- raw_circuits
drivers <- raw_drivers
constructors <- raw_constructors
results <- raw_results
Dropping columns from the datasets which won’t be in use
races <- races[,!names(races) %in% c("url",
"fp1_date","fp1_time",
"fp2_date","fp2_time",
"fp3_date","fp3_time",
"quali_date","quali_time",
"sprint_date","sprint_time",
"time")]
circuits <- circuits[,!names(circuits) %in% c("lat","lng",
"alt","url")]
drivers <- drivers[,!names(drivers) %in% c("driverRef","number",
"code","url")]
constructors <- constructors[,!names(constructors) %in% c("url","constructorRef")]
Combining forename and surname of Drivers
drivers <- unite(drivers, driver_name,
c(forename,surname,))
drivers
## # A tibble: 857 × 4
## driverId driver_name dob nationality
## <dbl> <chr> <date> <chr>
## 1 1 Lewis_Hamilton 1985-01-07 British
## 2 2 Nick_Heidfeld 1977-05-10 German
## 3 3 Nico_Rosberg 1985-06-27 German
## 4 4 Fernando_Alonso 1981-07-29 Spanish
## 5 5 Heikki_Kovalainen 1981-10-19 Finnish
## 6 6 Kazuki_Nakajima 1985-01-11 Japanese
## 7 7 Sébastien_Bourdais 1979-02-28 French
## 8 8 Kimi_Räikkönen 1979-10-17 Finnish
## 9 9 Robert_Kubica 1984-12-07 Polish
## 10 10 Timo_Glock 1982-03-18 German
## # ℹ 847 more rows
Renaming columns
colnames(races)[5] = "race_name"
view(races)
new_circuits <- circuits %>%
rename(circuit_name = name , city = location)
view(new_circuits)
new_constructors <- constructors %>%
rename(constructors_name = name)
view(new_constructors)
Merging Data frames
merged_df <- merge(results,raw_status, by="statusId")
merged_df <- merge(merged_df,races, by="raceId")
merged_df <- merge(merged_df,drivers, by="driverId")
merged_df <- merge(merged_df,constructors, by="constructorId")
merged_df <- merge(merged_df,circuits, by="circuitId")
Renaming columns in merged data frame
new_merged_df <- merged_df %>%
rename(driver_nationality = nationality.x, constructor_nationality = nationality.y)
Driver counts of different nationalities
describe(new_merged_df)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed mad
## circuitId 1 25840 23.49 18.57 18.0 20.78 16.31
## constructorId 2 25840 48.63 59.73 25.0 36.50 29.65
## driverId 3 25840 261.73 268.62 163.0 222.17 195.70
## raceId 4 25840 531.43 299.44 514.0 527.91 360.27
## statusId 5 25840 17.57 26.16 11.0 10.90 13.34
## resultId 6 25840 12921.33 7460.68 12920.5 12920.95 9577.60
## number 7 25834 17.79 15.10 15.0 15.66 11.86
## grid 8 25840 11.18 7.24 11.0 10.98 8.90
## position* 9 25840 11.09 12.74 4.0 9.59 4.45
## positionText* 10 25840 25.92 13.76 31.0 27.40 10.38
## positionOrder 11 25840 12.88 7.71 12.0 12.48 8.90
## points 12 25840 1.88 4.17 0.0 0.80 0.00
## laps 13 25840 45.98 29.81 52.0 45.90 26.69
## time* 14 25840 934.69 1835.35 1.0 472.18 0.00
## milliseconds* 15 25840 965.66 1898.59 1.0 486.40 0.00
## fastestLap* 16 25840 12.08 19.75 1.0 7.84 0.00
## rank* 17 25840 4.73 7.08 1.0 2.91 0.00
## fastestLapTime* 18 25840 961.90 1839.01 1.0 508.77 0.00
## fastestLapSpeed* 19 25840 993.00 1892.78 1.0 528.47 0.00
## status* 20 25840 53.13 34.80 60.0 51.34 26.69
## year 21 25840 1990.25 19.35 1990.0 1990.90 22.24
## round 22 25840 8.39 4.96 8.0 8.16 5.93
## race_name* 23 25840 22.87 14.29 21.0 21.89 16.31
## date* 24 25840 540.92 313.10 538.0 540.75 401.78
## driver_name* 25 25840 455.69 233.76 474.0 462.41 277.25
## dob 26 25840 NaN NA NA NaN NA
## driver_nationality* 27 25840 17.32 10.24 19.0 16.62 14.83
## name.x* 28 25840 116.84 62.63 123.0 118.72 72.65
## constructor_nationality* 29 25840 9.84 5.38 6.0 9.32 4.45
## circuitRef* 30 25840 42.98 20.82 43.0 43.86 26.69
## name.y* 31 25840 34.99 22.22 29.0 34.15 23.72
## location* 32 25840 42.91 19.60 44.0 43.87 26.69
## country* 33 25840 16.63 10.16 14.0 16.24 10.38
## min max range skew kurtosis se
## circuitId 1 79 78 1.13 0.49 0.12
## constructorId 1 214 213 1.53 1.13 0.37
## driverId 1 856 855 1.13 -0.03 1.67
## raceId 1 1096 1095 0.10 -1.06 1.86
## statusId 1 141 140 2.19 3.91 0.16
## resultId 1 25845 25844 0.00 -1.20 46.41
## number 0 208 208 2.42 9.71 0.09
## grid 0 34 34 0.19 -0.93 0.05
## position* 1 34 33 0.82 -1.10 0.08
## positionText* 1 39 38 -0.75 -1.10 0.09
## positionOrder 1 39 38 0.39 -0.50 0.05
## points 0 50 50 3.15 11.78 0.03
## laps 0 200 200 0.73 3.66 0.19
## time* 1 6839 6838 1.85 2.04 11.42
## milliseconds* 1 7051 7050 1.85 2.04 11.81
## fastestLap* 1 80 79 1.51 0.84 0.12
## rank* 1 26 25 1.83 1.98 0.04
## fastestLapTime* 1 6772 6771 1.78 1.79 11.44
## fastestLapSpeed* 1 6938 6937 1.77 1.75 11.77
## status* 1 137 136 0.16 -0.51 0.22
## year 1950 2022 72 -0.19 -0.92 0.12
## round 1 22 21 0.34 -0.82 0.03
## race_name* 1 53 52 0.53 -0.77 0.09
## date* 1 1079 1078 0.01 -1.20 1.95
## driver_name* 1 855 854 -0.20 -1.04 1.45
## dob Inf -Inf -Inf NA NA NA
## driver_nationality* 1 42 41 0.42 -0.63 0.06
## name.x* 1 210 209 -0.20 -1.11 0.39
## constructor_nationality* 1 24 23 0.85 0.17 0.03
## circuitRef* 1 76 75 -0.22 -1.08 0.13
## name.y* 1 76 75 0.37 -1.17 0.14
## location* 1 74 73 -0.28 -0.85 0.12
## country* 1 34 33 0.43 -1.07 0.06
driver_country <- new_merged_df %>%
group_by(driver_nationality) %>%
summarize(driver_counts = n_distinct(driver_name)) %>%
ungroup()
Changing column name
#driver_country %>%
# rename(driver_counts = driver_name)
driver_country
## # A tibble: 42 × 2
## driver_nationality driver_counts
## <chr> <int>
## 1 American 157
## 2 American-Italian 1
## 3 Argentine 24
## 4 Argentine-Italian 1
## 5 Australian 17
## 6 Austrian 15
## 7 Belgian 23
## 8 Brazilian 32
## 9 British 165
## 10 Canadian 14
## # ℹ 32 more rows
Taking top 6 Driver countries and combining every other country in a new row named “OTHERS”
driver_country1 <- driver_country[driver_country$driver_counts >= 30, ]
driver_country1 <- driver_country1[order(-driver_country1$driver_counts), ]
new_row <- c('Others', (sum(driver_country$driver_counts) - sum(driver_country1$driver_counts)))
driver_country1 <- rbind(driver_country1, new_row)
driver_country1
## # A tibble: 7 × 2
## driver_nationality driver_counts
## <chr> <chr>
## 1 British 165
## 2 American 157
## 3 Italian 99
## 4 French 73
## 5 German 50
## 6 Brazilian 32
## 7 Others 279
3D Pie Chart for number of drivers of different countries entered in this sport
amPieChart(
data = driver_country1,
category = "driver_nationality",
value = "driver_counts",
threeD = TRUE,
variableDepth = TRUE)
## Warning: `bindFillRole()` only works on htmltools::tag() objects (e.g., div(),
## p(), etc.), not objects of type 'shiny.tag.list'.
Analysis of Drivers’ Performance
driver_analysis_df <- merged_df %>%
group_by(year, driver_name) %>%
summarise(
total_points = sum(points),
race_count = n_distinct(raceId),
avg_positionOrder = mean(positionOrder),
std_positionOrder = sd(positionOrder)
) %>%
ungroup()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
head(driver_analysis_df)
## # A tibble: 6 × 6
## year driver_name total_points race_count avg_positionOrder std_positionOrder
## <dbl> <chr> <dbl> <int> <dbl> <dbl>
## 1 1950 Alberto_Asc… 11 4 8.6 7.77
## 2 1950 Alfredo_Pián 0 1 21 NA
## 3 1950 Bayliss_Lev… 0 1 27 NA
## 4 1950 Bill_Cantre… 0 1 27 NA
## 5 1950 Bill_Holland 6 1 2 NA
## 6 1950 Bill_Schind… 0 1 26 NA
Sorting and Renaming
colnames(driver_analysis_df) <- gsub(" ", "_",
colnames(driver_analysis_df))
colnames(driver_analysis_df) <- gsub("_$", "",
colnames(driver_analysis_df))
colnames(driver_analysis_df) <- ifelse(colnames(driver_analysis_df) == "year_",
"year", colnames(driver_analysis_df))
colnames(driver_analysis_df) <- ifelse(colnames(driver_analysis_df) == "driver_name_",
"driver_name", colnames(driver_analysis_df))
Drivers with most points with every year
champion <- driver_analysis_df %>%
group_by(year, driver_name) %>%
summarise(points_sum = sum(total_points)) %>%
ungroup()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
champion
## # A tibble: 3,165 × 3
## year driver_name points_sum
## <dbl> <chr> <dbl>
## 1 1950 Alberto_Ascari 11
## 2 1950 Alfredo_Pián 0
## 3 1950 Bayliss_Levrett 0
## 4 1950 Bill_Cantrell 0
## 5 1950 Bill_Holland 6
## 6 1950 Bill_Schindler 0
## 7 1950 Bob_Gerard 0
## 8 1950 Brian_Shawe Taylor 0
## 9 1950 Cecil_Green 3
## 10 1950 Charles_Pozzi 0
## # ℹ 3,155 more rows
Top 20 drivers year wise in descending order
champion %>%
arrange(desc(year)) %>%
head(champion,n=20)
## # A tibble: 20 × 3
## year driver_name points_sum
## <dbl> <chr> <dbl>
## 1 2022 Alexander_Albon 4
## 2 2022 Carlos_Sainz 228
## 3 2022 Charles_Leclerc 291
## 4 2022 Daniel_Ricciardo 34
## 5 2022 Esteban_Ocon 89
## 6 2022 Fernando_Alonso 81
## 7 2022 George_Russell 262
## 8 2022 Guanyu_Zhou 6
## 9 2022 Kevin_Magnussen 21
## 10 2022 Lance_Stroll 18
## 11 2022 Lando_Norris 116
## 12 2022 Lewis_Hamilton 233
## 13 2022 Max_Verstappen 433
## 14 2022 Mick_Schumacher 12
## 15 2022 Nicholas_Latifi 2
## 16 2022 Nico_Hülkenberg 0
## 17 2022 Nyck_de Vries 2
## 18 2022 Pierre_Gasly 23
## 19 2022 Sebastian_Vettel 37
## 20 2022 Sergio_Pérez 291
Top 10 drivers with most F1 races
most_races <- merged_df %>%
group_by(driver_name) %>%
summarise(total_races = n_distinct(raceId)) %>%
arrange(desc(total_races)) %>%
head(10) %>%
rename(total_races = total_races)
most_races
## # A tibble: 10 × 2
## driver_name total_races
## <chr> <int>
## 1 Fernando_Alonso 358
## 2 Kimi_Räikkönen 352
## 3 Rubens_Barrichello 326
## 4 Lewis_Hamilton 310
## 5 Jenson_Button 309
## 6 Michael_Schumacher 308
## 7 Sebastian_Vettel 300
## 8 Felipe_Massa 271
## 9 Riccardo_Patrese 257
## 10 Jarno_Trulli 256
Plotting graph for top 10 drivers with most F1 races
ggplot(most_races, aes(x = total_races, y = driver_name, fill = driver_name)) +
geom_bar(stat = "identity", width = 0.5) +
labs(title = "Top 10 Drivers with most race entries in Formula 1") +
coord_flip() +
scale_x_continuous(expand = c(0, 0)) +
theme(plot.title = element_text(size = 16),
axis.text.y = element_text(size = 12),
axis.text.x = element_text(size = 4),
axis.title.y = element_blank(),
axis.title.x = element_text(size = 14)) +
scale_fill_discrete(guide = FALSE)
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## ℹ Please use "none" instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Most number of laps done by the drivers in their career
most_laps <- merged_df %>%
group_by(driver_name) %>%
summarise(total_laps = sum(laps)) %>%
arrange(desc(total_laps)) %>%
head(10)
most_laps
## # A tibble: 10 × 2
## driver_name total_laps
## <chr> <dbl>
## 1 Fernando_Alonso 19153
## 2 Kimi_Räikkönen 18618
## 3 Lewis_Hamilton 17681
## 4 Michael_Schumacher 16824
## 5 Rubens_Barrichello 16642
## 6 Sebastian_Vettel 16426
## 7 Jenson_Button 16274
## 8 Felipe_Massa 14852
## 9 Sergio_Pérez 13012
## 10 Daniel_Ricciardo 12716
Graph for most number of laps done by drivers
ggplot(most_laps, aes(x = total_laps, y = driver_name, fill = driver_name)) +
geom_bar(stat = "identity", width = 0.5) +
labs(title = "Top 10 Drivers with most laps in Formula 1") +
theme_minimal() +
coord_flip() +
scale_x_continuous(expand = c(0, 0)) +
theme(plot.title = element_text(size = 16),
axis.text.y = element_text(size = 12),
axis.text.x = element_text(size = 3.8),
axis.title.y = element_blank(),
axis.title.x = element_text(size = 14))
Compare Drivers Average points and deviation
driver_analysis_df2 <- merged_df %>%
group_by(driver_name) %>%
summarise(raceId_count = n(),
positionOrder_mean = mean(positionOrder),
positionOrder_std = sd(positionOrder)) %>%
ungroup() %>%
filter(raceId_count > 100) %>%
arrange(positionOrder_mean) %>%
head(10)
driver_analysis_df2
## # A tibble: 10 × 4
## driver_name raceId_count positionOrder_mean positionOrder_std
## <chr> <int> <dbl> <dbl>
## 1 Lewis_Hamilton 310 4.79 5.64
## 2 Max_Verstappen 163 6.53 6.32
## 3 Michael_Schumacher 308 6.88 7.07
## 4 Sebastian_Vettel 300 7.09 6.41
## 5 Alain_Prost 202 7.50 7.83
## 6 Valtteri_Bottas 201 7.60 5.66
## 7 Charles_Leclerc 103 8.12 6.01
## 8 Denny_Hulme 112 8.21 6.30
## 9 Nico_Rosberg 206 8.25 6.25
## 10 Ayrton_Senna 162 8.26 8.28
Graph for average points finished vs SD of finishd position
ggplot(driver_analysis_df2, aes(x = positionOrder_mean, y = positionOrder_std, color = driver_name)) +
geom_point(size = 4) +
labs(title = 'Average position finished vs Standard Deviation of position finished') +
theme_minimal() +
scale_color_discrete(name = "Driver Name") +
xlab("Average Position Finished") +
ylab("Standard Deviation of Position Finished") +
theme(plot.title = element_text(size = 13),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12))
Constructor Analysis
Converting “\N” values to NA
merged_df<-merged_df %>%
mutate(across(where(is.character), ~na_if(.,"\\N")))
Obtaining number of constructors won by countries
constructor_country <- new_merged_df %>%
group_by(constructor_nationality) %>%
summarise(constructor_counts = n_distinct(name.x)) %>%
ungroup()
constructor_country
## # A tibble: 24 × 2
## constructor_nationality constructor_counts
## <chr> <int>
## 1 American 38
## 2 Australian 1
## 3 Austrian 1
## 4 Belgian 1
## 5 Brazilian 1
## 6 British 86
## 7 Canadian 2
## 8 Dutch 3
## 9 East German 1
## 10 French 13
## # ℹ 14 more rows
Countries that won more than 9 constructors
constructor_country1 <- constructor_country %>%
filter(constructor_counts >= 9) %>%
arrange(desc(constructor_counts))
constructor_country1
## # A tibble: 5 × 2
## constructor_nationality constructor_counts
## <chr> <int>
## 1 British 86
## 2 American 38
## 3 Italian 29
## 4 French 13
## 5 German 10
Graph of countries winning more tha 9 constructors
amPieChart(
data = constructor_country1,
category = "constructor_nationality",
value = "constructor_counts",
threeD = TRUE,
variableDepth = TRUE)
## Warning: `bindFillRole()` only works on htmltools::tag() objects (e.g., div(),
## p(), etc.), not objects of type 'shiny.tag.list'.
A vector with year, team names with total points, race count, mean position order in constructor and standard deviation of position order
con_analysis_df <- new_merged_df %>%
group_by(year, name.x) %>%
summarise(total_points = sum(points),
race_count = n_distinct(raceId),
positionOrder_mean = mean(positionOrder),
positionOrder_std = sd(positionOrder)) %>%
ungroup()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
con_analysis_df
## # A tibble: 1,091 × 6
## year name.x total_points race_count positionOrder_mean positionOrder_std
## <dbl> <chr> <dbl> <int> <dbl> <dbl>
## 1 1950 Adams 0 1 27 0
## 2 1950 Alfa Romeo 89 6 6.45 6.30
## 3 1950 Alta 0 2 12.7 3.51
## 4 1950 Cooper 0 1 19 NA
## 5 1950 Deidt 10 1 12 16.5
## 6 1950 ERA 0 3 13.2 6.84
## 7 1950 Ewing 0 1 17 NA
## 8 1950 Ferrari 21 5 9.47 6.81
## 9 1950 Kurtis Kr… 14 1 15.8 10.6
## 10 1950 Langley 0 1 16 NA
## # ℹ 1,081 more rows
Renaming the name column in table
#con_analysis_df <- con_analysis_df %>%
# rename( constructor_name = name.x)
#con_analysis_df
Top 20 Constructor Champions
#result <- con_champion_df %>%
# arrange(desc(year)) %>%
# select(year,constructor_name) %>%
# head(result,n=20)
#result
Getting race entries by constructors need con_name raceID
grouping con_name every race with 2 entries for 2 cars with team name
avg position and sd position for graph same as driver points
finishes
con_analysis_df2 <- new_merged_df %>%
group_by(name.x, raceId) %>%
summarize(positionOrder_mean = mean(positionOrder),
positionOrder_std = sd(positionOrder)) %>%
ungroup()
## `summarise()` has grouped output by 'name.x'. You can override using the
## `.groups` argument.
con_analysis_df2
## # A tibble: 12,568 × 4
## name.x raceId positionOrder_mean positionOrder_std
## <chr> <dbl> <dbl> <dbl>
## 1 AFM 814 29.7 3.06
## 2 AFM 816 14 NA
## 3 AFM 817 18 NA
## 4 AFM 822 11.5 3.54
## 5 AGS 305 18 14.1
## 6 AGS 306 26.5 2.12
## 7 AGS 307 27.5 0.707
## 8 AGS 308 26 2.83
## 9 AGS 309 27.5 0.707
## 10 AGS 310 29 1.41
## # ℹ 12,558 more rows
New data frame for same
con_analysis_df3 <- con_analysis_df2 %>%
group_by(name.x) %>%
summarize(constructor_entry = n(),
positionOrder_mean = mean(positionOrder_mean),
positionOrder_std = mean(positionOrder_std)) %>%
ungroup()
con_analysis_df3
## # A tibble: 210 × 4
## name.x constructor_entry positionOrder_mean positionOrder_std
## <chr> <int> <dbl> <dbl>
## 1 AFM 4 18.3 NA
## 2 AGS 78 23.5 NA
## 3 ATS 116 18.0 NA
## 4 Adams 1 27 0
## 5 Alfa Romeo 194 13.8 NA
## 6 AlphaTauri 61 11.9 4.44
## 7 Alpine F1 Team 44 10.1 3.39
## 8 Alta 5 13.3 NA
## 9 Amon 4 27.4 NA
## 10 Andrea Moda 8 30.2 NA
## # ℹ 200 more rows
Top 20 teams with most race entries
top_constructors <- con_analysis_df3 %>%
arrange(desc(constructor_entry)) %>%
head(20)
top_constructors
## # A tibble: 20 × 4
## name.x constructor_entry positionOrder_mean positionOrder_std
## <chr> <int> <dbl> <dbl>
## 1 Ferrari 1054 8.34 NA
## 2 McLaren 883 9.48 NA
## 3 Williams 797 10.9 NA
## 4 Tyrrell 433 13.2 NA
## 5 Renault 403 11.3 NA
## 6 Sauber 395 12.9 NA
## 7 Team Lotus 395 13.1 NA
## 8 Red Bull 348 7.64 4.21
## 9 Minardi 345 15.9 NA
## 10 Ligier 330 13.4 NA
## 11 Brabham 308 13.9 NA
## 12 Arrows 296 14.3 NA
## 13 Mercedes 271 5.76 3.57
## 14 Toro Rosso 268 13.5 3.45
## 15 Benetton 260 10.2 5.84
## 16 Jordan 250 13.1 4.75
## 17 Force India 212 11.7 3.33
## 18 BRM 208 13.1 NA
## 19 March 196 16.4 NA
## 20 Alfa Romeo 194 13.8 NA
ggplot(top_constructors, aes(y = name.x, x = constructor_entry, fill = name.x)) +
geom_bar(stat = "identity", width = 0.5) +
labs(title = "Top 20 Constructors with most number of race entries") +
theme_minimal() +
coord_flip() +
theme(plot.title = element_text(size = 13),
axis.title = element_text(size = 14),
axis.text = element_text(size = 4))
Filtering and sorting top performing teams in constructors with average position finished and standard deviation of position finished
con_analysis_df3 <- con_analysis_df3[con_analysis_df3$constructor_entry > 200, ]
con_analysis_df3 <- con_analysis_df3 %>%
arrange(positionOrder_mean) %>%
head(10)
ggplot(con_analysis_df3, aes(x = positionOrder_mean, y = positionOrder_std, color = name.x)) +
geom_point(size = 5) +
labs(title = "Constructor's Average Position Finished vs Standard Deviation of Position Finished") +
theme_minimal() +
guides(color = guide_legend(title = "Constructors Name"))
## Warning: Removed 6 rows containing missing values (`geom_point()`).
CIRCUIT ANALYSIS
Making new data frames extracting data from the dataset
fastestlap <- merge(races, circuits, by = 'circuitId')
fastestlap <- merge(fastestlap, results, by = 'raceId')
Replacing “\N” values with NA
fastestlap<-fastestlap %>%
mutate(across(where(is.character), ~na_if(.,"\\N")))
Top 20 circuits with most number of races hosted
most_circuits <- fastestlap %>%
group_by(year, name) %>%
summarize(count = n()) %>%
ungroup()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
m <- most_circuits %>%
count(name) %>%
arrange(desc(n)) %>%
head(20) %>%
rename(count = n, circuit_name = name)
m
## # A tibble: 20 × 2
## circuit_name count
## <chr> <int>
## 1 Autodromo Nazionale di Monza 72
## 2 Circuit de Monaco 68
## 3 Silverstone Circuit 56
## 4 Circuit de Spa-Francorchamps 55
## 5 Circuit Gilles Villeneuve 41
## 6 Nürburgring 41
## 7 Autódromo José Carlos Pace 39
## 8 Hockenheimring 37
## 9 Hungaroring 37
## 10 Red Bull Ring 34
## 11 Circuit Park Zandvoort 32
## 12 Circuit de Barcelona-Catalunya 32
## 13 Suzuka Circuit 32
## 14 Autodromo Enzo e Dino Ferrari 30
## 15 Albert Park Grand Prix Circuit 25
## 16 Autódromo Hermanos Rodríguez 22
## 17 Autódromo Juan y Oscar Gálvez 20
## 18 Kyalami 20
## 19 Watkins Glen 20
## 20 Indianapolis Motor Speedway 19
BarPlot
ggplot(m, aes(y = circuit_name, x = count), fill = circuit_name) +
geom_bar(stat = "identity", width = 0.5) +
labs(title = "Top 20 Circuits with most number of races hosted") +
theme_minimal()
Races hosted from 2004 to 2022
fastestlap <- fastestlap %>%
filter(year >= 2004, year <= 2022)
host <- fastestlap %>%
group_by(year, name) %>%
summarize(count = n()) %>%
ungroup()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
host
## # A tibble: 362 × 3
## year name count
## <dbl> <chr> <int>
## 1 2004 Albert Park Grand Prix Circuit 20
## 2 2004 Autodromo Enzo e Dino Ferrari 20
## 3 2004 Autodromo Nazionale di Monza 20
## 4 2004 Autódromo José Carlos Pace 20
## 5 2004 Bahrain International Circuit 20
## 6 2004 Circuit Gilles Villeneuve 20
## 7 2004 Circuit de Barcelona-Catalunya 20
## 8 2004 Circuit de Monaco 20
## 9 2004 Circuit de Nevers Magny-Cours 20
## 10 2004 Circuit de Spa-Francorchamps 20
## # ℹ 352 more rows
circuit_name_counts <- table(host$name)
circuit_name_counts
##
## Albert Park Grand Prix Circuit Autodromo Enzo e Dino Ferrari
## 17 6
## Autódromo Hermanos Rodríguez Autódromo Internacional do Algarve
## 7 2
## Autodromo Internazionale del Mugello Autódromo José Carlos Pace
## 1 18
## Autodromo Nazionale di Monza Bahrain International Circuit
## 19 18
## Baku City Circuit Buddh International Circuit
## 6 3
## Circuit de Barcelona-Catalunya Circuit de Monaco
## 19 18
## Circuit de Nevers Magny-Cours Circuit de Spa-Francorchamps
## 5 18
## Circuit Gilles Villeneuve Circuit of the Americas
## 16 10
## Circuit Park Zandvoort Circuit Paul Ricard
## 2 4
## Fuji Speedway Hockenheimring
## 2 10
## Hungaroring Indianapolis Motor Speedway
## 19 4
## Istanbul Park Jeddah Corniche Circuit
## 9 2
## Korean International Circuit Losail International Circuit
## 4 1
## Marina Bay Street Circuit Miami International Autodrome
## 13 1
## Nürburgring Red Bull Ring
## 8 9
## Sepang International Circuit Shanghai International Circuit
## 14 16
## Silverstone Circuit Sochi Autodrom
## 19 8
## Suzuka Circuit Valencia Street Circuit
## 15 5
## Yas Marina Circuit
## 14
Top 6 popular circuits from 2004
# Autodromo Nazionale di Monza
lap_time_monza <- fastestlap[fastestlap$name == 'Autodromo Nazionale di Monza', ]
# Circuit de Monaco
lap_time_monaco <- fastestlap[fastestlap$name == 'Circuit de Monaco', ]
# Silverstone Circuit
lap_time_silverstone <- fastestlap[fastestlap$name == 'Silverstone Circuit', ]
# Circuit de Barcelona-Catalunya
lap_time_catalunya <- fastestlap[fastestlap$name == 'Circuit de Barcelona-Catalunya', ]
# Hungaroring
lap_time_hungaroring <- fastestlap[fastestlap$name == 'Hungaroring', ]
# Circuit de Spa-Francorchamps
lap_time_spa <- fastestlap[fastestlap$name == 'Circuit de Spa-Francorchamps', ]
Grouping by year and calculating minimum fastest lap of tracks
# Silverstone
lap <- lap_time_silverstone %>%
group_by(year) %>%
summarize(minfastestlap = min(fastestLapTime)) %>%
ungroup()
lap
## # A tibble: 19 × 2
## year minfastestlap
## <dbl> <chr>
## 1 2004 1:18.739
## 2 2005 1:20.502
## 3 2006 <NA>
## 4 2007 1:20.638
## 5 2008 <NA>
## 6 2009 1:20.735
## 7 2010 1:30.874
## 8 2011 1:34.908
## 9 2012 <NA>
## 10 2013 1:33.401
## 11 2014 <NA>
## 12 2015 <NA>
## 13 2016 1:35.548
## 14 2017 <NA>
## 15 2018 <NA>
## 16 2019 1:27.369
## 17 2020 <NA>
## 18 2021 <NA>
## 19 2022 <NA>
# Monaco
lap1 <- lap_time_monaco %>%
group_by(year) %>%
summarize(minfastestlap = min(fastestLapTime)) %>%
ungroup()
lap1
## # A tibble: 18 × 2
## year minfastestlap
## <dbl> <chr>
## 1 2004 <NA>
## 2 2005 1:15.842
## 3 2006 1:15.143
## 4 2007 <NA>
## 5 2008 1:16.689
## 6 2009 1:15.154
## 7 2010 <NA>
## 8 2011 <NA>
## 9 2012 <NA>
## 10 2013 1:16.577
## 11 2014 <NA>
## 12 2015 1:18.063
## 13 2016 1:17.939
## 14 2017 1:14.820
## 15 2018 1:14.260
## 16 2019 1:14.279
## 17 2021 <NA>
## 18 2022 1:14.693
# Monza
lap2 <- lap_time_monza %>%
group_by(year) %>%
summarize(minfastestlap = min(fastestLapTime)) %>%
ungroup()
lap2
## # A tibble: 19 × 2
## year minfastestlap
## <dbl> <chr>
## 1 2004 <NA>
## 2 2005 1:21.504
## 3 2006 1:22.559
## 4 2007 <NA>
## 5 2008 1:28.047
## 6 2009 <NA>
## 7 2010 <NA>
## 8 2011 <NA>
## 9 2012 1:27.239
## 10 2013 <NA>
## 11 2014 1:28.004
## 12 2015 <NA>
## 13 2016 <NA>
## 14 2017 1:23.361
## 15 2018 <NA>
## 16 2019 1:21.779
## 17 2020 1:22.746
## 18 2021 <NA>
## 19 2022 1:24.030
# Catalunya
lap3 <- lap_time_catalunya %>%
group_by(year) %>%
summarize(minfastestlap = min(fastestLapTime)) %>%
ungroup()
lap3
## # A tibble: 19 × 2
## year minfastestlap
## <dbl> <chr>
## 1 2004 1:17.450
## 2 2005 1:15.641
## 3 2006 1:16.648
## 4 2007 <NA>
## 5 2008 <NA>
## 6 2009 <NA>
## 7 2010 <NA>
## 8 2011 1:26.727
## 9 2012 1:26.250
## 10 2013 1:26.217
## 11 2014 1:28.918
## 12 2015 1:28.270
## 13 2016 <NA>
## 14 2017 <NA>
## 15 2018 <NA>
## 16 2019 1:18.492
## 17 2020 1:18.183
## 18 2021 1:18.149
## 19 2022 1:24.108
# Hungaroring
lap4 <- lap_time_hungaroring %>%
group_by(year) %>%
summarize(minfastestlap = min(fastestLapTime)) %>%
ungroup()
lap4
## # A tibble: 19 × 2
## year minfastestlap
## <dbl> <chr>
## 1 2004 1:19.071
## 2 2005 <NA>
## 3 2006 <NA>
## 4 2007 1:20.047
## 5 2008 1:21.195
## 6 2009 <NA>
## 7 2010 <NA>
## 8 2011 1:23.415
## 9 2012 1:24.136
## 10 2013 1:24.069
## 11 2014 1:25.724
## 12 2015 1:24.821
## 13 2016 1:23.086
## 14 2017 <NA>
## 15 2018 <NA>
## 16 2019 1:17.103
## 17 2020 1:16.627
## 18 2021 <NA>
## 19 2022 1:21.386
# Spa
lap5 <- lap_time_spa %>%
group_by(year) %>%
summarize(minfastestlap = min(fastestLapTime)) %>%
ungroup()
lap5
## # A tibble: 18 × 2
## year minfastestlap
## <dbl> <chr>
## 1 2004 <NA>
## 2 2005 1:51.453
## 3 2007 <NA>
## 4 2008 1:47.930
## 5 2009 <NA>
## 6 2010 <NA>
## 7 2011 <NA>
## 8 2012 <NA>
## 9 2013 1:50.756
## 10 2014 <NA>
## 11 2015 <NA>
## 12 2016 <NA>
## 13 2017 <NA>
## 14 2018 <NA>
## 15 2019 <NA>
## 16 2020 <NA>
## 17 2021 <NA>
## 18 2022 <NA>
Plotting line graph against for tracks against best time over the years
ggplot() +
geom_line(data = lap, aes(x = year, y = minfastestlap), color = "blue", group = 1) +
geom_line(data = lap1, aes(x = year, y = minfastestlap), color = "orange", group = 1) +
geom_line(data = lap2, aes(x = year, y = minfastestlap), color = "green", group = 1) +
geom_line(data = lap3, aes(x = year, y = minfastestlap), color = "red", group = 1) +
geom_line(data = lap4, aes(x = year, y = minfastestlap), color = "purple", group = 1) +
geom_line(data = lap5, aes(x = year, y = minfastestlap), color = "pink", group = 1) +
scale_x_continuous(breaks = seq(min(lap$year), max(lap$year), by = 1)) +
labs(y = "Fastest lap Time", x = "Year", title = "Lap time over the years") +
theme_minimal() +
scale_color_manual(values = c("Silverstone"="blue", "Monaco"="orange","Monza"= "green","Catalunya"= "red","Hungaroring"= "purple","Spa"= "pink")) +
guides(color = guide_legend(title = "Circuits")) +
theme(legend.position = "right")
## Warning: Removed 10 rows containing missing values (`geom_line()`).
## Warning: Removed 7 rows containing missing values (`geom_line()`).
## Warning: Removed 10 rows containing missing values (`geom_line()`).
## Warning: Removed 7 rows containing missing values (`geom_line()`).
## Removed 7 rows containing missing values (`geom_line()`).
## Warning: Removed 15 rows containing missing values (`geom_line()`).