mydataf1 <- read.csv("F1Drivers_Dataset.csv", header = TRUE, sep = ",", dec = ".")
The data set contains information about 868 Formula 1 drivers.
head(mydataf1)
## Driver Nationality Seasons Championships
## 1 Carlo Abate Italy [1962, 1963] 0
## 2 George Abecassis United Kingdom [1951, 1952] 0
## 3 Kenny Acheson United Kingdom [1983, 1985] 0
## 4 Andrea de Adamich Italy [1968, 1970, 1971, 1972, 1973] 0
## 5 Philippe Adams Belgium [1994] 0
## 6 Walt Ader United States [1950] 0
## Race_Entries Race_Starts Pole_Positions Race_Wins Podiums Fastest_Laps Points
## 1 3 0 0 0 0 0 0
## 2 2 2 0 0 0 0 0
## 3 10 3 0 0 0 0 0
## 4 36 30 0 0 0 0 6
## 5 2 2 0 0 0 0 0
## 6 1 1 0 0 0 0 0
## Active Championship.Years Decade Pole_Rate Start_Rate Win_Rate Podium_Rate
## 1 False 1960 0 0.0000000 0 0
## 2 False 1950 0 1.0000000 0 0
## 3 False 1980 0 0.3000000 0 0
## 4 False 1970 0 0.8333333 0 0
## 5 False 1990 0 1.0000000 0 0
## 6 False 1950 0 1.0000000 0 0
## FastLap_Rate Points_Per_Entry Years_Active Champion
## 1 0 0.0000000 2 False
## 2 0 0.0000000 2 False
## 3 0 0.0000000 2 False
## 4 0 0.1666667 5 False
## 5 0 0.0000000 1 False
## 6 0 0.0000000 1 False
summary(mydataf1)
## Driver Nationality Seasons Championships
## Length:868 Length:868 Length:868 Min. :0.0000
## Class :character Class :character Class :character 1st Qu.:0.0000
## Mode :character Mode :character Mode :character Median :0.0000
## Mean :0.0841
## 3rd Qu.:0.0000
## Max. :7.0000
## Race_Entries Race_Starts Pole_Positions Race_Wins
## Min. : 1.00 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 2.00 1st Qu.: 1.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 7.00 Median : 5.00 Median : 0.000 Median : 0.000
## Mean : 29.92 Mean : 27.69 Mean : 1.244 Mean : 1.248
## 3rd Qu.: 29.25 3rd Qu.: 26.00 3rd Qu.: 0.000 3rd Qu.: 0.000
## Max. :359.00 Max. :356.00 Max. :103.000 Max. :103.000
## Podiums Fastest_Laps Points Active
## Min. : 0.000 Min. : 0.000 Min. : 0.00 Length:868
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.00 Class :character
## Median : 0.000 Median : 0.000 Median : 0.00 Mode :character
## Mean : 3.757 Mean : 1.262 Mean : 55.85
## 3rd Qu.: 0.000 3rd Qu.: 0.000 3rd Qu.: 8.00
## Max. :191.000 Max. :77.000 Max. :4415.50
## Championship.Years Decade Pole_Rate Start_Rate
## Length:868 Min. :1950 Min. :0.00000 Min. :0.0000
## Class :character 1st Qu.:1960 1st Qu.:0.00000 1st Qu.:0.6667
## Mode :character Median :1970 Median :0.00000 Median :0.9623
## Mean :1972 Mean :0.01147 Mean :0.7798
## 3rd Qu.:1982 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :2020 Max. :0.55769 Max. :1.0000
## Win_Rate Podium_Rate FastLap_Rate Points_Per_Entry
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. : 0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.: 0.0000
## Median :0.00000 Median :0.00000 Median :0.00000 Median : 0.0000
## Mean :0.01105 Mean :0.04139 Mean :0.01189 Mean : 0.4792
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.: 0.3825
## Max. :0.46154 Max. :1.00000 Max. :0.50000 Max. :14.1977
## Years_Active Champion
## Min. : 1.000 Length:868
## 1st Qu.: 1.000 Class :character
## Median : 2.000 Mode :character
## Mean : 3.665
## 3rd Qu.: 5.000
## Max. :19.000
mydataf1[mydataf1 == ""] <- NA
names(mydataf1)[names(mydataf1) == "Championship.Years"] <- "Championship_Years"
nrow(mydataf1)
## [1] 868
ncol(mydataf1)
## [1] 22
colSums(is.na(mydataf1))
## Driver Nationality Seasons Championships
## 0 0 0 0
## Race_Entries Race_Starts Pole_Positions Race_Wins
## 0 0 0 0
## Podiums Fastest_Laps Points Active
## 0 0 0 0
## Championship_Years Decade Pole_Rate Start_Rate
## 834 0 0 0
## Win_Rate Podium_Rate FastLap_Rate Points_Per_Entry
## 0 0 0 0
## Years_Active Champion
## 0 0
mydataf1$Driver <- factor(mydataf1$Driver)
mydataf1$Nationality <- factor(mydataf1$Nationality)
mydataf1$Championship_Years <- factor(mydataf1$Championship_Years)
mydataf1$Decade <- factor(mydataf1$Decade)
mydataf1$Champion <- factor(mydataf1$Champion)
sum(is.na(mydataf1$Championship_Years))
## [1] 834
mydata_champ <- mydataf1[!is.na(mydataf1$Championship_Years), ]
mydata_champ$Win_Pct <- mydata_champ$Win_Rate * 100
head(mydata_champ$Win_Pct)
## [1] 8.913649 9.160305 39.393939 10.937500 4.854369 34.246575
after2000 <- mydata_champ[mydata_champ$Decade %in% c("2000", "2010", "2020"), ]
after2000[, c("Driver","Nationality","Champion","Championships","Championship_Years")]
## Driver Nationality Champion Championships
## 18 Fernando Alonso Spain True 2
## 136 Jenson Button United Kingdom True 1
## 335 Mika Häkkinen Finland True 2
## 339 Lewis Hamilton United Kingdom True 7
## 364 Damon Hill United Kingdom True 1
## 636 Kimi Räikkönen Finland True 1
## 671 Nico Rosberg Germany True 1
## 711 Michael Schumacher Germany True 7
## 812 Max Verstappen Netherlands True 2
## 813 Sebastian Vettel Germany True 4
## 815 Jacques Villeneuve Canada True 1
## Championship_Years
## 18 [2005, 2006]
## 136 [2009]
## 335 [1998, 1999]
## 339 [2008, 2014, 2015, 2017, 2018, 2019, 2020]
## 364 [1996]
## 636 [2007]
## 671 [2016]
## 711 [1994, 1995, 2000, 2001, 2002, 2003, 2004]
## 812 [2021, 2022]
## 813 [2010, 2011, 2012, 2013]
## 815 [1997]
germans <- mydata_champ[mydata_champ$Nationality == "Germany", ]
germans[, c("Driver","Nationality","Champion","Championships", "Championship_Years")]
## Driver Nationality Champion Championships
## 671 Nico Rosberg Germany True 1
## 711 Michael Schumacher Germany True 7
## 813 Sebastian Vettel Germany True 4
## Championship_Years
## 671 [2016]
## 711 [1994, 1995, 2000, 2001, 2002, 2003, 2004]
## 813 [2010, 2011, 2012, 2013]
mydata_champ$Experience <- ifelse(mydata_champ$Years_Active > 10,
"Veteran", "Rookie/Normal")
table(mydata_champ$Experience)
##
## Rookie/Normal Veteran
## 15 19
summary(mydata_champ$Podium_Rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1328 0.2367 0.2999 0.3375 0.4363 0.6731
summary(mydata_champ$Win_Rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03906 0.08078 0.11024 0.15555 0.20972 0.46154
summary(mydata_champ$Years_Active)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.00 8.25 11.00 11.74 14.75 19.00
Podium rate:
Win_Rate:
Years_Active:
library(ggplot2)
ggplot(mydata_champ, aes(x = Podium_Rate, y = Win_Rate)) +
geom_point(color = "darkred") +
geom_smooth(method = "lm", se = FALSE, color = "darkmagenta") +
labs(title = "Podium Rate vs Win Rate",
x = "Podium Rate",
y = "Win Rate")
## `geom_smooth()` using formula = 'y ~ x'
hist(mydata_champ$Win_Pct,
main = "Distribution of Win % (Champions)",
xlab = "Win Percentage",
col = "brown", border = "white",
breaks = seq(0, 50, by = 10))
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
nat_counts <- mydata_champ %>%
group_by(Nationality) %>%
tally() %>%
filter(n >= 3)
mydata_filtered <- mydata_champ %>%
filter(Nationality %in% nat_counts$Nationality)
ggplot(mydata_filtered, aes(x = Nationality, y = Points)) +
geom_boxplot(fill = "coral") +
labs(title = "Points by Nationality", x = "Nationality", y = "Points") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(readxl)
mydata_mba <- read_excel("Business School.xlsx")
head(mydata_mba)
## # A tibble: 6 × 9
## `Student ID` `Undergrad Degree` `Undergrad Grade` `MBA Grade`
## <dbl> <chr> <dbl> <dbl>
## 1 1 Business 68.4 90.2
## 2 2 Computer Science 70.2 68.7
## 3 3 Finance 76.4 83.3
## 4 4 Business 82.6 88.7
## 5 5 Finance 76.9 75.4
## 6 6 Computer Science 83.3 82.1
## # ℹ 5 more variables: `Work Experience` <chr>, `Employability (Before)` <dbl>,
## # `Employability (After)` <dbl>, Status <chr>, `Annual Salary` <dbl>
summary(mydata_mba)
## Student ID Undergrad Degree Undergrad Grade MBA Grade
## Min. : 1.00 Length:100 Min. : 61.20 Min. :58.14
## 1st Qu.: 25.75 Class :character 1st Qu.: 71.47 1st Qu.:71.14
## Median : 50.50 Mode :character Median : 76.65 Median :76.38
## Mean : 50.50 Mean : 76.90 Mean :76.04
## 3rd Qu.: 75.25 3rd Qu.: 81.70 3rd Qu.:82.15
## Max. :100.00 Max. :100.00 Max. :95.00
## Work Experience Employability (Before) Employability (After)
## Length:100 Min. :101.0 Min. :119.0
## Class :character 1st Qu.:245.8 1st Qu.:312.0
## Mode :character Median :256.8 Median :435.6
## Mean :257.9 Mean :422.7
## 3rd Qu.:261.0 3rd Qu.:529.0
## Max. :421.0 Max. :631.0
## Status Annual Salary
## Length:100 Min. : 20000
## Class :character 1st Qu.: 87125
## Mode :character Median :103500
## Mean :109058
## 3rd Qu.:124000
## Max. :340000
nrow(mydata_mba)
## [1] 100
ncol(mydata_mba)
## [1] 9
names(mydata_mba)[names(mydata_mba) == "Undergrad Degree"] <- "Undergrad_Degree"
library(ggplot2)
ggplot(mydata_mba, aes(x = Undergrad_Degree)) +
geom_bar(fill = "darkred", color = "white") +
labs (title = "Distribution of Undergraduate Degrees",
x = "Undergraduate Degree")
names(mydata_mba)[names(mydata_mba) == "Annual Salary"] <- "Annual_Salary"
summary(mydata_mba$"Annual_Salary")
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20000 87125 103500 109058 124000 340000
ggplot(mydata_mba, aes(x = Annual_Salary)) +
geom_histogram(binwidth = 20000, fill = "magenta", color = "darkorchid") +
scale_x_continuous(limits = c(0, 200000)) +
labs(title = "Distribution of Annual Salary",
x = "Annual Salary")
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
names(mydata_mba)[names(mydata_mba) == "MBA Grade"] <- "MBA_Grade"
t.test(mydata_mba$MBA_Grade, mu = 74)
##
## One Sample t-test
##
## data: mydata_mba$MBA_Grade
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 74.51764 77.56346
## sample estimates:
## mean of x
## 76.04055
mean_grade <- mean(mydata_mba$MBA_Grade)
sd_grade <- sd(mydata_mba$MBA_Grade)
cohens_d <- (mean_grade - 74) / sd_grade
cohens_d
## [1] 0.2658658
library(readxl)
apts <- read_excel("Apartments.xlsx")
apts$ID <- 1:nrow(apts)
head(apts)
## # A tibble: 6 × 6
## Age Distance Price Parking Balcony ID
## <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 7 28 1640 0 1 1
## 2 18 1 2800 1 0 2
## 3 7 28 1660 0 0 3
## 4 28 29 1850 0 1 4
## 5 18 18 1640 1 1 5
## 6 28 12 1770 0 1 6
summary(apts)
## Age Distance Price Parking
## Min. : 1.00 Min. : 1.00 Min. :1400 Min. :0.0000
## 1st Qu.:12.00 1st Qu.: 4.00 1st Qu.:1710 1st Qu.:0.0000
## Median :18.00 Median :12.00 Median :1950 Median :1.0000
## Mean :18.55 Mean :14.22 Mean :2019 Mean :0.5059
## 3rd Qu.:24.00 3rd Qu.:20.00 3rd Qu.:2290 3rd Qu.:1.0000
## Max. :45.00 Max. :45.00 Max. :2820 Max. :1.0000
## Balcony ID
## Min. :0.0000 Min. : 1
## 1st Qu.:0.0000 1st Qu.:22
## Median :0.0000 Median :43
## Mean :0.4353 Mean :43
## 3rd Qu.:1.0000 3rd Qu.:64
## Max. :1.0000 Max. :85
nrow(apts)
## [1] 85
ncol(apts)
## [1] 6
Description:
apts$Parking <- factor(apts$Parking,
levels = c(0,1),
labels = c("No","Yes"))
apts$Balcony <- factor(apts$Balcony,
levels = c(0,1),
labels = c("No","Yes"))
t.test(apts$Price, mu = 1900, alternative = "two.sided")
##
## One Sample t-test
##
## data: apts$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
fit1 <- lm(Price ~ Age, data = apts)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = apts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
cor(apts$Age, apts$Price, use = "complete.obs")
## [1] -0.230255
library(GGally)
ggpairs(apts[, c("Price", "Age", "Distance")],
upper = list(continuous = wrap("cor", size = 4)),
lower = list(continuous = wrap("points", alpha = 0.6, size = 1.5, color = "darkred")),
diag = list(continuous = wrap("densityDiag", alpha = 0.5, fill = "coral"))) +
theme_minimal()
fit2 <- lm(Price ~ Age + Distance, data = apts)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = apts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
vif(fit2)
## Age Distance
## 1.001845 1.001845
apts$StdResid <- rstandard(fit2)
apts$CooksD <- cooks.distance(fit2)
bad_idx <- which(abs(apts$StdResid) > 3 | apts$CooksD > 1)
bad_idx
## named integer(0)
apts_clean <- if(length(bad_idx)) apts[-bad_idx, ] else apts
n <- nrow(apts)
cutoff <- 4/n
cutoff
## [1] 0.04705882
bad_idx_alt <- which(abs(apts$StdResid) > 3 | apts$CooksD > cutoff)
bad_idx_alt
## 22 33 38 53 55
## 22 33 38 53 55
library(ggplot2)
df_plot <- data.frame(
Fitted = as.numeric(scale(fitted(fit2))),
Residuals = apts$StdResid
)
ggplot(df_plot, aes(x = Fitted, y = Residuals)) +
geom_point(color = "darkred", size = 2) +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray40") +
labs(
title = "Residuals vs Fitted (fit2)",
x = "Standardized Fitted Values",
y = "Standardized Residuals"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
axis.title = element_text(face = "bold")
)
library(ggplot2)
ggplot(apts, aes(x = StdResid)) +
geom_histogram(aes(y = ..density..), bins = 12,
fill = "blueviolet", color = "white") +
labs(title = "Histogram of Standardized Residuals",
x = "Standardized Residuals", y = "Density") +
theme_minimal(base_size = 14) +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
qqnorm(apts$StdResid); qqline(apts$StdResid, col = 2)
shapiro.test(apts$StdResid)
##
## Shapiro-Wilk normality test
##
## data: apts$StdResid
## W = 0.95306, p-value = 0.00366
fit2_clean <- lm(Price ~ Age + Distance, data = apts_clean)
summary(fit2_clean)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = apts_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = apts_clean)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = apts_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -459.92 -200.66 -57.48 260.08 594.37
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2301.667 94.271 24.415 < 2e-16 ***
## Age -6.799 3.110 -2.186 0.03172 *
## Distance -18.045 2.758 -6.543 5.28e-09 ***
## ParkingYes 196.168 62.868 3.120 0.00251 **
## BalconyYes 1.935 60.014 0.032 0.97436
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.7 on 80 degrees of freedom
## Multiple R-squared: 0.5004, Adjusted R-squared: 0.4754
## F-statistic: 20.03 on 4 and 80 DF, p-value: 1.849e-11
anova(fit2_clean, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking + Balcony
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 82 6720983
## 2 80 5991088 2 729894 4.8732 0.01007 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = apts_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -459.92 -200.66 -57.48 260.08 594.37
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2301.667 94.271 24.415 < 2e-16 ***
## Age -6.799 3.110 -2.186 0.03172 *
## Distance -18.045 2.758 -6.543 5.28e-09 ***
## ParkingYes 196.168 62.868 3.120 0.00251 **
## BalconyYes 1.935 60.014 0.032 0.97436
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.7 on 80 degrees of freedom
## Multiple R-squared: 0.5004, Adjusted R-squared: 0.4754
## F-statistic: 20.03 on 4 and 80 DF, p-value: 1.849e-11
apts_clean$Fitted_fit3 <- fitted(fit3)
apts_clean$Resid_fit3 <- residuals(fit3)
apts_clean[apts_clean$ID == 2, c("Fitted_fit3","Resid_fit3")]
## # A tibble: 1 × 2
## Fitted_fit3 Resid_fit3
## <dbl> <dbl>
## 1 2357. 443.