mydata <- read.csv("RidersSummary.csv", header = TRUE, sep = ",")
head(mydata)
## bike_number rider_name class season home_country motorcycle
## 1 1 Jorge Martin Moto3 2015 Spain Mahindra
## 2 1 Jorge Martin Moto3 2016 Spain Mahindra
## 3 1 Jorge Martin Moto3 2017 Spain Honda
## 4 1 Jorge Martin Moto3 2018 Spain Honda
## 5 1 Jorge Martin Moto2 2019 Spain KTM
## 6 1 Jorge Martin Moto2 2020 Spain Kalex
## team races_participated wins podium pole
## 1 Mapfre Team Mahindra 18 0 0 0
## 2 Pull & Bear Aspar Mahindra Team 16 0 1 0
## 3 Del Conca Gresini Moto3 16 1 9 9
## 4 Del Conca Gresini Moto3 17 7 10 11
## 5 Red Bull KTM Ajo 19 0 2 0
## 6 Red Bull KTM Ajo 13 2 6 1
## fastest_lap points placed world_championships
## 1 0 45 17 0
## 2 0 72 16 0
## 3 2 196 4 0
## 4 3 260 1 1
## 5 1 94 11 0
## 6 2 160 5 0
bike_number: Racing number assigned to each rider
rider_name: Full name of the motorcycle racer
class: Racing category (125cc, Moto3, Moto2, MotoGP, MotoE)
season: Year of competition (2008-2025)
home_country: Rider’s country of origin (8 countries represented)
motorcycle: Brand/model of racing motorcycle
team: Racing team affiliation
races_participated: Number of races entered in a season
wins: Race victories achieved
podium: Top-3 finishes (includes wins)
pole: Pole position starts (fastest qualifying time)
fastest_lap: Fastest lap times recorded during races
points: Championship points earned
placed: The rider's final position in the championship standings for that season; lower numbers indicate a better overall rank.
world_championships: Coded as 1 if the rider won the world championship that year, 0 if not.
win_rate: The percentage of races won in the season.
podium_rate: Calculated as podium finishes divided by races participated.
points_per_race: Total points divided by number of races participated.
career_stage: A label showing the rider’s career phase that season.
season_ranking: The rider's final championship position; identical to 'placed', renamed for clarity.
fastest_laps: The number of times the rider set the fastest lap in a race during the season.
titles_won: The number of championship titles won by the rider in that season.
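The variable list above includes several derived measures (win_rate, podium_rate, points_per_race, career_stage) that the code below does not all create. A minimal dplyr sketch of how they could be computed is given here; the career_stage cut-offs are illustrative assumptions, not values taken from the original data.
library(dplyr)
riders_derived <- read.csv("RidersSummary.csv") %>%
  filter(races_participated > 0) %>%          # avoid division by zero
  group_by(rider_name) %>%
  mutate(
    win_rate        = wins / races_participated * 100,
    podium_rate     = podium / races_participated,
    points_per_race = points / races_participated,
    career_stage    = case_when(
      season - min(season) < 3 ~ "early",     # assumed: first three seasons
      season - min(season) < 7 ~ "mid",       # assumed: seasons 4 to 7
      TRUE                     ~ "late"
    )
  ) %>%
  ungroup() %>%
  rename(season_ranking = placed, fastest_laps = fastest_lap)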
# Load necessary packages
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read in the dataset
riders <- read.csv("RidersSummary.csv")
# DATA MANIPULATIONS:
# 1. Create new variable
riders <- riders %>%
mutate(win_percentage = (wins / races_participated) * 100)
# 2. Delete units due to missing data
riders_clean <- riders %>%
filter(races_participated > 0)
# 3. Rename variables
riders_clean <- riders_clean %>%
rename(season_ranking = placed,
fastest_laps = fastest_lap)
# 4. Create new data frame based on conditions
motogp_only <- riders_clean %>%
filter(class == "MotoGP")
# SHOW RESULTS:
print(paste("Original dataset:", nrow(riders), "rows"))
## [1] "Original dataset: 274 rows"
print(paste("After cleaning:", nrow(riders_clean), "rows"))
## [1] "After cleaning: 266 rows"
print(paste("MotoGP subset:", nrow(motogp_only), "rows"))
## [1] "MotoGP subset: 128 rows"
# Show sample of new variables
head(riders_clean[, c("rider_name", "season", "wins", "races_participated", "win_percentage")])
## rider_name season wins races_participated win_percentage
## 1 Jorge Martin 2015 0 18 0.00000
## 2 Jorge Martin 2016 0 16 0.00000
## 3 Jorge Martin 2017 1 16 6.25000
## 4 Jorge Martin 2018 7 17 41.17647
## 5 Jorge Martin 2019 0 19 0.00000
## 6 Jorge Martin 2020 2 13 15.38462
# Plot histogram using cleaned data
ggplot(riders_clean, aes(x = win_percentage)) +
geom_histogram(binwidth = 5, fill = "steelblue", color = "black", alpha = 0.7) +
labs(
title = "Distribution of Riders' Win Percentages",
x = "Win Percentage",
y = "Number of Riders"
) +
theme_minimal()
This histogram shows the distribution of riders’ win percentages in motorcycle racing seasons.
Most riders have a win percentage close to 0%, meaning the majority of riders rarely win any races.
There are very few riders with high win percentages (right side of the graph), showing that only a small number of seasons are dominated by exceptional performers (e.g., Marquez in 2014, Jorge Martin in 2024).
The shape is right-skewed: almost all the data is crowded near zero, with a long tail stretching toward higher win rates.
This highlights how competitive motorcycle racing is.
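The skewness claim can be backed up numerically. A quick check, assuming riders_clean as created above, reports the share of zero-win seasons and contrasts the mean with the median:
# Proportion of rider-seasons with no wins, and mean vs. median win percentage
mean(riders_clean$win_percentage == 0)
summary(riders_clean$win_percentage)[c("Median", "Mean")]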
# Select key variables for analysis
selected_vars <- c("wins", "podium", "points", "win_percentage", "races_participated")
# Present descriptive statistics
summary(riders_clean[, selected_vars])
## wins podium points win_percentage
## Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 47.75 1st Qu.: 0.00
## Median : 0.000 Median : 2.000 Median :126.00 Median : 0.00
## Mean : 1.398 Mean : 3.598 Mean :137.01 Mean : 8.29
## 3rd Qu.: 2.000 3rd Qu.: 6.000 3rd Qu.:203.50 3rd Qu.: 11.11
## Max. :13.000 Max. :18.000 Max. :508.00 Max. :100.00
## races_participated
## Min. : 1.00
## 1st Qu.:14.00
## Median :18.00
## Mean :15.13
## 3rd Qu.:18.00
## Max. :20.00
# Additional statistics
cat("Standard Deviations:\n")
## Standard Deviations:
sapply(riders_clean[, selected_vars], sd, na.rm = TRUE)
## wins podium points win_percentage
## 2.479927 4.312316 106.786773 14.923377
## races_participated
## 5.744121
cat("\nNumber of observations:", nrow(riders_clean))
##
## Number of observations: 266
# Show distribution of categorical variable
table(riders_clean$class)
##
## 125cc Moto2 Moto3 MotoE MotoGP
## 12 64 61 1 128
ggplot(riders_clean, aes(x = class, y = points, fill = class)) +
geom_boxplot() +
scale_fill_manual(values = c("MotoGP" = "slateblue4",
"Moto2" = "peru",
"Moto3" = "plum4",
"125cc" = "olivedrab3",
"MotoE" = "skyblue2")) +
labs(
title = "Championship Points by Racing Class",
x = "Racing Class",
y = "Championship Points"
) +
theme_minimal() +
theme(legend.position = "none")
This boxplot shows how total championship points are distributed for each motorcycle racing class.
- MotoGP (slateblue): the widest spread, the highest outliers (dots above the box), and a high median; some rider-seasons score exceptionally many points.
- Moto2 and Moto3 (orange and purple): boxes are higher and wider than 125cc, meaning more points and more variability; medians are below MotoGP but still strong.
- 125cc (green): a lower box and narrower spread; rider-seasons here generally score fewer points.
- MotoE: only one rider-season in the data, so the box collapses to a single line.
Overall, MotoGP has the highest and most variable scoring (top competitors and big gaps), while 125cc and MotoE score lower. Outliers in MotoGP mark exceptional rider-seasons, champions who dominate a season. The box heights and locations reflect typical scoring for each class: higher boxes mean more points per season.
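Since the variable list also defines points_per_race, a complementary table (a sketch, assuming riders_clean as above) compares scoring per race across classes directly:
riders_clean %>%
  mutate(points_per_race = points / races_participated) %>%
  group_by(class) %>%
  summarise(
    n_rider_seasons = n(),
    median_ppr = median(points_per_race),
    mean_ppr = mean(points_per_race)
  ) %>%
  arrange(desc(median_ppr))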
#install.packages("readxl")
library(readxl)
Business_School <- read_excel("Business School.xlsx")
head(Business_School)
## # A tibble: 6 × 9
## `Student ID` `Undergrad Degree` `Undergrad Grade` `MBA Grade`
## <dbl> <chr> <dbl> <dbl>
## 1 1 Business 68.4 90.2
## 2 2 Computer Science 70.2 68.7
## 3 3 Finance 76.4 83.3
## 4 4 Business 82.6 88.7
## 5 5 Finance 76.9 75.4
## 6 6 Computer Science 83.3 82.1
## # ℹ 5 more variables: `Work Experience` <chr>, `Employability (Before)` <dbl>,
## # `Employability (After)` <dbl>, Status <chr>, `Annual Salary` <dbl>
library(ggplot2)
ggplot(Business_School, aes(x = `Undergrad Degree`, fill = `Undergrad Degree`)) +
geom_bar(color = "white") +
scale_fill_manual(values = c(
"Business" = "slateblue4",
"Engineering" = "peru",
"Art" = "plum4",
"Science" = "olivedrab3",
"Computer Science" = "skyblue2",
"Finance" = "tomato3"
)) +
scale_y_continuous(breaks = seq(0, 50, by = 5)) +
labs(title = "Distribution of Undergraduate Degrees",
x = "Undergraduate Degree",
y = "Frequency")
### Explanation
- This bar chart illustrates the distribution of undergraduate degrees among MBA students.
- We can see that Business is the most prevalent degree, followed by Computer Science and Finance, which are equal, with Engineering and Art being the least common.
names(Business_School)[names(Business_School) == "Annual Salary"] <- "Annual_Salary"
summary(Business_School$Annual_Salary)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20000 87125 103500 109058 124000 340000
ggplot(Business_School, aes(x = Annual_Salary)) +
geom_histogram(binwidth = 20000, fill = "slategray", color = "snow4") +
scale_x_continuous(limits = c(0, 200000)) +
labs(title = "Distribution of Annual Salary",
x = "Annual Salary",
y = "Frequency")
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Note: the warnings above appear because scale_x_continuous(limits = c(0, 200000))
# drops observations outside that range; they can be silenced with the chunk option warning = FALSE.
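An alternative that avoids these warnings altogether is to zoom the axis with coord_cartesian() rather than setting hard limits; a sketch using the same data and binning as above:
ggplot(Business_School, aes(x = Annual_Salary)) +
  geom_histogram(binwidth = 20000, fill = "slategray", color = "snow4") +
  coord_cartesian(xlim = c(0, 200000)) +  # zooms in without dropping observations
  labs(title = "Distribution of Annual Salary",
       x = "Annual Salary",
       y = "Frequency")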
#Hypothesis
names(Business_School)[names(Business_School) == "MBA Grade"] <- "MBA_Grade"
t.test(Business_School$MBA_Grade, mu = 74)
##
## One Sample t-test
##
## data: Business_School$MBA_Grade
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 74.51764 77.56346
## sample estimates:
## mean of x
## 76.04055
The test rejects the null hypothesis that the population mean MBA grade equals 74; the sample mean 76.04 is significantly higher, with p = 0.009 and a 95% CI of 74.52 to 77.56 that does not include 74.
The sample mean of 76.04 lies about 2.04 points above the hypothesized value of 74.
The 95% confidence interval for µ is [74.52, 77.56], implying that plausible values for the population mean lie in the mid-70s, strictly above 74.
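The reported statistic and interval can be reproduced by hand from t = (sample mean − 74) / (s / sqrt(n)); a sketch assuming MBA_Grade as renamed above:
x <- Business_School$MBA_Grade
n <- length(x)
se <- sd(x) / sqrt(n)                              # standard error of the mean
t_stat <- (mean(x) - 74) / se                      # should reproduce t = 2.6587
ci <- mean(x) + c(-1, 1) * qt(0.975, n - 1) * se   # should reproduce the 95% CI above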
library(readxl)
apts <- read_excel("Apartments.xlsx")
apts$ID <- 1:nrow(apts)
head(apts)
## # A tibble: 6 × 6
## Age Distance Price Parking Balcony ID
## <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 7 28 1640 0 1 1
## 2 18 1 2800 1 0 2
## 3 7 28 1660 0 0 3
## 4 28 29 1850 0 1 4
## 5 18 18 1640 1 1 5
## 6 28 12 1770 0 1 6
str(apts)
## tibble [85 × 6] (S3: tbl_df/tbl/data.frame)
## $ Age : num [1:85] 7 18 7 28 18 28 14 18 22 25 ...
## $ Distance: num [1:85] 28 1 28 29 18 12 20 6 7 2 ...
## $ Price : num [1:85] 1640 2800 1660 1850 1640 1770 1850 1970 2270 2570 ...
## $ Parking : num [1:85] 0 1 0 0 1 0 0 1 1 1 ...
## $ Balcony : num [1:85] 1 0 0 1 1 1 1 1 0 0 ...
## $ ID : int [1:85] 1 2 3 4 5 6 7 8 9 10 ...
summary(apts)
## Age Distance Price Parking
## Min. : 1.00 Min. : 1.00 Min. :1400 Min. :0.0000
## 1st Qu.:12.00 1st Qu.: 4.00 1st Qu.:1710 1st Qu.:0.0000
## Median :18.00 Median :12.00 Median :1950 Median :1.0000
## Mean :18.55 Mean :14.22 Mean :2019 Mean :0.5059
## 3rd Qu.:24.00 3rd Qu.:20.00 3rd Qu.:2290 3rd Qu.:1.0000
## Max. :45.00 Max. :45.00 Max. :2820 Max. :1.0000
## Balcony ID
## Min. :0.0000 Min. : 1
## 1st Qu.:0.0000 1st Qu.:22
## Median :0.0000 Median :43
## Mean :0.4353 Mean :43
## 3rd Qu.:1.0000 3rd Qu.:64
## Max. :1.0000 Max. :85
nrow(apts)
## [1] 85
ncol(apts)
## [1] 6
df2 <- read_excel("Apartments.xlsx")
df2 <- df2 %>%
mutate(
Parking = factor(Parking, levels = c(0,1), labels = c("No","Yes")),
Balcony = factor(Balcony, levels = c(0,1), labels = c("No","Yes"))
)
t.test(apts$Price, mu = 1900, alternative = "two.sided")
##
## One Sample t-test
##
## data: apts$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
library(readxl)
dat <- read_excel("Apartments.xlsx", sheet = "Apartments")
# Simple regression: Price = f(Age)
fit1 <- lm(Price ~ Age, data = dat)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
Estimated regression function: Price = 2185.45 - 8.98 * Age.
The intercept 2185.45 is the estimated Price when Age = 0, and the slope −8.98 indicates the expected change in Price for a one-year increase in Age.
The sample Pearson correlation between Age and Price is r ≈ -0.230, showing a weak negative linear association: older apartments tend to have slightly lower prices per square meter, but the linear association is not strong.
The coefficient of determination is R² ≈ 0.053.
This value indicates that about 5.3% of the variation in Price is explained by Age alone; the remaining 94.7% is due to other factors and unexplained variability.
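Both quantities can be verified directly, since in simple regression R² is the square of the Pearson correlation; a quick check assuming dat as read above:
r <- cor(dat$Age, dat$Price)   # Pearson correlation, approximately -0.230
r^2                            # coefficient of determination, approximately 0.053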
#install.packages("GGally")
library(readxl)
library(GGally)
dat <- read_excel("Apartments.xlsx", sheet = "Apartments")
GGally::ggpairs(dat[, c("Price", "Age", "Distance")])
library(readxl)
dat <- read_excel("Apartments.xlsx", sheet = "Apartments")
fit2 <- lm(Price ~ Age + Distance, data = dat)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
#install.packages("car")
#install.packages("vif")
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
vif(fit2)
## Age Distance
## 1.001845 1.001845
library(readxl)
library(car)
dat <- read_excel("Apartments.xlsx", sheet = "Apartments")
fit2 <- lm(Price ~ Age + Distance, data = dat)
stan.resid <- rstandard(fit2)
cooks <- cooks.distance(fit2)
# Identify outliers and high influence cases
prob_units <- which(abs(stan.resid) > 2 | cooks > 4/length(stan.resid))
dat_clean <- dat[-prob_units, ]
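Before refitting, it can help to see how many units were flagged and which rows they are; a short check using the objects created above:
length(prob_units)   # number of flagged observations
dat[prob_units, ]    # the flagged rows themselves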
library(readxl)
dat <- read_excel("Apartments.xlsx", sheet = "Apartments")
fit2 <- lm(Price ~ Age + Distance, data = dat)
stan.resid <- rstandard(fit2)
stan.fit <- scale(fitted(fit2))
plot(stan.fit, stan.resid, ylab='Standardized residuals', xlab='Standardized fitted values', pch=20)
abline(h=0, lty=2)
library(readxl)
dat <- read_excel("Apartments.xlsx", sheet = "Apartments")
fit2 <- lm(Price ~ Age + Distance, data = dat)
stan.resid <- rstandard(fit2)
hist(stan.resid,
breaks = 20,
col = "grey55",
main = "Histogram of Standardized Residuals",
xlab = "Standardized Residuals")
# Normal Q-Q plot
qqnorm(stan.resid)
qqline(stan.resid)
# Shapiro-Wilk test
shapiro.test(stan.resid)
##
## Shapiro-Wilk normality test
##
## data: stan.resid
## W = 0.95306, p-value = 0.00366
fit2_clean <- lm(Price ~ Age + Distance, data = dat_clean)
summary(fit2_clean)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = dat_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.50 -203.69 -45.24 191.11 492.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2502.467 75.024 33.356 < 2e-16 ***
## Age -8.674 3.221 -2.693 0.00869 **
## Distance -24.063 2.692 -8.939 1.57e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared: 0.5361, Adjusted R-squared: 0.524
## F-statistic: 44.49 on 2 and 77 DF, p-value: 1.437e-13
# Make sure Parking and Balcony are factors
library(readxl)
dat <- read_excel("Apartments.xlsx", sheet = "Apartments")
dat$Parking <- as.factor(dat$Parking)
dat$Balcony <- as.factor(dat$Balcony)
fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = dat)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -459.92 -200.66 -57.48 260.08 594.37
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2301.667 94.271 24.415 < 2e-16 ***
## Age -6.799 3.110 -2.186 0.03172 *
## Distance -18.045 2.758 -6.543 5.28e-09 ***
## Parking1 196.168 62.868 3.120 0.00251 **
## Balcony1 1.935 60.014 0.032 0.97436
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.7 on 80 degrees of freedom
## Multiple R-squared: 0.5004, Adjusted R-squared: 0.4754
## F-statistic: 20.03 on 4 and 80 DF, p-value: 1.849e-11
anova(fit2, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking + Balcony
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 82 6720983
## 2 80 5991088 2 729894 4.8732 0.01007 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
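Since the Balcony coefficient is clearly non-significant while Parking is, a natural follow-up is to compare fit2 with a model that adds Parking alone; a sketch (fit3b is a hypothetical name, not part of the original analysis):
fit3b <- lm(Price ~ Age + Distance + Parking, data = dat)
anova(fit2, fit3b)   # partial F-test for adding Parking only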
fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = dat)
dat$fitted_fit3 <- fitted(fit3)
dat$resid_fit3 <- resid(fit3)
id2_fitted <- dat$fitted_fit3[2]
id2_residual <- dat$resid_fit3[2]
id2_fitted
## 2
## 2357.411
id2_residual
## 2
## 442.5889
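As a hand check, the fitted value for apartment 2 can be rebuilt from the coefficients reported above and that apartment's values shown in head(apts) (Age = 18, Distance = 1, Parking = 1, Balcony = 0, Price = 2800); this is a sketch, with manual_fitted and manual_resid as illustrative names:
b <- coef(fit3)
manual_fitted <- b["(Intercept)"] + b["Age"] * 18 + b["Distance"] * 1 +
  b["Parking1"] * 1 + b["Balcony1"] * 0   # approximately 2357.41
manual_resid <- 2800 - manual_fitted      # approximately 442.59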