# Names of the columns to keep for the analysis, in the order they will
# appear in the subset data frame.
relevant_columns <- c(
  # identifier
  "team_id",
  # team ratings
  "overall",
  "attack",
  "midfield",
  "defence",
  # finances
  "transfer_budget_eur",
  "club_worth_eur",
  # squad age
  "starting_xi_average_age",
  "whole_team_average_age",
  # coach, prestige and rival
  "coach_id",
  "international_prestige",
  "domestic_prestige",
  "rival_team"
)
# Subset the 'Fifa' data frame to the columns named in 'relevant_columns'
# (all rows are kept). 'Fifa' itself is not created in this part of the file.
# The text after the statement is the rendered structure of the result.
Fifa_cleaned <- Fifa[, relevant_columns]## 'data.frame': 385055 obs. of 13 variables:
## $ team_id : int 10 73 243 1337 5 9 21 241 1318 44 ...
## $ overall : int 85 85 85 85 84 84 84 84 84 83 ...
## $ attack : int 85 87 85 82 83 84 78 84 85 85 ...
## $ midfield : int 86 83 86 85 83 82 86 85 83 83 ...
## $ defence : int 86 83 84 82 83 86 83 82 83 83 ...
## $ transfer_budget_eur : int NA NA NA NA NA NA NA NA NA NA ...
## $ club_worth_eur :integer64 -2147483648 -2147483648 -2147483648 NA -2147483648 -2147483648 -2147483648 -2147483648 ...
## $ starting_xi_average_age: num 27.1 26.8 28.6 28.3 26.7 ...
## $ whole_team_average_age : num 25.9 26 24.6 27.2 24.6 ...
## $ coach_id : int 455361 524011 455800 37352367 452683 455353 458813 184942 474589 128160 ...
## $ international_prestige : int 10 9 10 10 8 9 10 10 8 7 ...
## $ domestic_prestige : int 10 10 10 NA 9 9 10 10 NA 9 ...
## $ rival_team : int 11 219 241 1318 1 11 22 243 1337 47 ...
## team_id overall attack midfield
## Min. : 1 Min. :50.0 Min. :46.00 Min. :48.00
## 1st Qu.: 485 1st Qu.:65.0 1st Qu.:66.00 1st Qu.:65.00
## Median : 1877 Median :69.0 Median :69.00 Median :68.00
## Mean : 44058 Mean :69.1 Mean :69.62 Mean :68.91
## 3rd Qu.:110827 3rd Qu.:72.0 3rd Qu.:73.00 3rd Qu.:72.00
## Max. :116361 Max. :86.0 Max. :93.00 Max. :89.00
##
## defence transfer_budget_eur club_worth_eur
## Min. :48.00 Min. : 100000 Min. :-2147483648
## 1st Qu.:64.00 1st Qu.: 1200000 1st Qu.: 5000000
## Median :68.00 Median : 2900000 Median : 13000000
## Mean :68.34 Mean : 8070019 Mean : 48005629
## 3rd Qu.:72.00 3rd Qu.: 7000000 3rd Qu.: 51000000
## Max. :87.00 Max. :190900000 Max. : 2100000000
## NA's :31922 NA's : 94962
## starting_xi_average_age whole_team_average_age coach_id
## Min. :18.82 Min. :19.00 Min. : 24
## 1st Qu.:25.18 1st Qu.:23.96 1st Qu.: 219788
## Median :26.36 Median :24.88 Median : 467859
## Mean :26.31 Mean :24.95 Mean : 6398032
## 3rd Qu.:27.45 3rd Qu.:25.86 3rd Qu.: 1553390
## Max. :33.64 Max. :33.11 Max. :37666625
## NA's :2328
## international_prestige domestic_prestige rival_team
## Min. : 1.000 Min. : 1.000 Min. : 1
## 1st Qu.: 1.000 1st Qu.: 4.000 1st Qu.: 300
## Median : 3.000 Median : 6.000 Median : 1463
## Mean : 3.797 Mean : 6.643 Mean : 34452
## 3rd Qu.: 5.000 3rd Qu.: 9.000 3rd Qu.:101151
## Max. :20.000 Max. :20.000 Max. :116361
## NA's :25829
## team_id overall attack
## 0 0 0
## midfield defence transfer_budget_eur
## 0 0 31922
## club_worth_eur starting_xi_average_age whole_team_average_age
## 94962 0 0
## coach_id international_prestige domestic_prestige
## 2328 0 25829
## rival_team
## 0
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies
# Percentage (0-100) of the elements of 'x' that are NA.
#   x: a vector.
#   Returns a single number; NaN when 'x' has length 0 (0 / 0).
percentmiss <- function(x) {
  n_missing <- sum(is.na(x))
  n_missing / length(x) * 100
}
# Percentage of missing values in each row of 'Fifa_cleaned'.
# is.na() is applied to the data frame itself (the same way the complete-row
# filter further down counts NAs) rather than going through apply(), which
# first coerces the whole data frame to a matrix. The apply() version reported
# only 37,705 rows with any missing value, although club_worth_eur alone has
# 94,962 NAs, so NAs were being lost in that coercion.
missing <- rowSums(is.na(Fifa_cleaned)) / ncol(Fifa_cleaned) * 100
# NOTE: the table printed below was produced by the earlier apply()-based
# code; re-run to refresh it.
table(missing)## missing
## 0 7.69230769230769 15.3846153846154
## 347350 11782 25923
# Keep only the rows of 'Fifa_cleaned' that have no missing value in any
# column (row-wise NA count equal to zero).
replace_rows <- Fifa_cleaned[rowSums(is.na(Fifa_cleaned)) == 0, ]
# …
## Outliers
## 'data.frame': 282672 obs. of 13 variables:
## $ team_id : int 9 10 73 5 21 243 11 240 241 44 ...
## $ overall : int 85 85 85 84 84 84 83 83 83 82 ...
## $ attack : int 86 84 89 84 92 85 82 84 83 82 ...
## $ midfield : int 84 87 82 85 85 86 83 82 85 82 ...
## $ defence : int 85 86 84 83 82 83 81 82 81 83 ...
## $ transfer_budget_eur : int 95000000 176000000 160000000 85000000 100000000 150000000 175000000 68600000 130000000 99500000 ...
## $ club_worth_eur :integer64 -2147483648 -2147483648 -2147483648 -2147483648 -2147483648 -2147483648 -2147483648 1200000000 ...
## $ starting_xi_average_age: num 27.7 26.1 26.2 27.4 26.8 ...
## $ whole_team_average_age : num 24.8 23.8 25.5 24.4 24.5 ...
## $ coach_id : int 455353 455361 524011 452683 458813 455800 456113 452946 184942 128160 ...
## $ international_prestige : int 9 10 9 8 10 10 8 9 10 7 ...
## $ domestic_prestige : int 9 10 10 8 10 10 9 9 10 9 ...
## $ rival_team : int 11 11 219 1 22 241 9 243 243 47 ...
# Columns to draw histograms for.
# NOTE(review): coach_id and rival_team look like identifiers rather than
# measurements — confirm they belong in this list.
numeric_cols <- c(
  "overall",
  "attack",
  "midfield",
  "defence",
  "starting_xi_average_age",
  "whole_team_average_age",
  "coach_id",
  "international_prestige",
  "domestic_prestige",
  "rival_team"
)
# Draw one histogram per column listed in 'numeric_cols', using the column
# name as both the plot title and the x-axis label.
for (col in numeric_cols) {
  hist(replace_rows[[col]], main = col, xlab = col)
}

# Null Hypothesis (H0): There is no significant relationship between the
# transfer budget and performance.
# Alternative Hypothesis (H1): There is a significant relationship between the
# transfer budget and performance.

# Conduct t-tests
# Between transfer_budget_eur and the overall rating.
# Called with two vectors, t.test() runs a Welch two-sample test on the
# difference between their means; the result is stored and then printed.
# NOTE(review): comparing the mean budget (EUR) with the mean rating does not
# test the budget/performance relationship stated in the hypotheses;
# cor.test() or lm() is presumably what is intended — confirm.
t_test_Overall <- t.test(replace_rows$transfer_budget_eur, replace_rows$overall)
print(t_test_Overall)##
## Welch Two Sample t-test
##
## data: replace_rows$transfer_budget_eur and replace_rows$overall
## t = 255.78, df = 282671, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 8716012 8850619
## sample estimates:
## mean of x mean of y
## 8.783385e+06 6.894352e+01
# Between transfer_budget_eur and club_worth_eur.
# Keep only the rows that complete.cases() reports as non-missing in both
# columns.
# NOTE(review): 'replace_rows' was already built from rows without NAs, yet
# the printed test shows a different df and mean of x than the other tests,
# so rows are being dropped here. club_worth_eur is an integer64 column;
# presumably complete.cases() misreads some of its values — confirm.
replace_rows_complete <- replace_rows[complete.cases(replace_rows[, c("transfer_budget_eur", "club_worth_eur")]), ]
# Welch two-sample t-test on the difference between the two column means.
# NOTE(review): the printed "mean of y" (3.7e-316) looks like an integer64
# value displayed as a double; consider converting club_worth_eur with
# as.numeric() first — confirm. As with the other tests, a comparison of
# means does not measure the relationship between the two variables.
t_test_club_worth <- t.test(replace_rows_complete$transfer_budget_eur, replace_rows_complete$club_worth_eur)
# Earlier variant on the unfiltered 'replace_rows', kept for reference:
#t_test_club_worth<- t.test(replace_rows$transfer_budget_eur, replace_rows$club_worth_eur)
print(t_test_club_worth)##
## Welch Two Sample t-test
##
## data: replace_rows_complete$transfer_budget_eur and replace_rows_complete$club_worth_eur
## t = -2957.9, df = 279212, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -68294474 -68204026
## sample estimates:
## mean of x mean of y
## 7.349660e+06 3.735082e-316
# Between transfer_budget_eur and the attack rating.
# Welch two-sample t-test on the difference between the two column means;
# the result is stored and then printed.
# NOTE(review): a difference-of-means test does not measure the relationship
# between budget and rating; cor.test() or lm() is presumably intended —
# confirm.
t_test_attack<- t.test(replace_rows$transfer_budget_eur, replace_rows$attack)
print(t_test_attack)##
## Welch Two Sample t-test
##
## data: replace_rows$transfer_budget_eur and replace_rows$attack
## t = 255.78, df = 282671, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 8716012 8850619
## sample estimates:
## mean of x mean of y
## 8.783385e+06 6.949291e+01
# Between transfer_budget_eur and the midfield rating.
# Welch two-sample t-test on the difference between the two column means;
# the result is stored and then printed.
# NOTE(review): a difference-of-means test does not measure the relationship
# between budget and rating; cor.test() or lm() is presumably intended —
# confirm.
t_test_midfield<- t.test(replace_rows$transfer_budget_eur, replace_rows$midfield)
print(t_test_midfield)##
## Welch Two Sample t-test
##
## data: replace_rows$transfer_budget_eur and replace_rows$midfield
## t = 255.78, df = 282671, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 8716012 8850619
## sample estimates:
## mean of x mean of y
## 8.783385e+06 6.890881e+01
# Between transfer_budget_eur and the defence rating.
# Welch two-sample t-test on the difference between the two column means;
# the result is stored and then printed.
# NOTE(review): a difference-of-means test does not measure the relationship
# between budget and rating; cor.test() or lm() is presumably intended —
# confirm.
t_test_defence<- t.test(replace_rows$transfer_budget_eur, replace_rows$defence)
print(t_test_defence)##
## Welch Two Sample t-test
##
## data: replace_rows$transfer_budget_eur and replace_rows$defence
## t = 255.78, df = 282671, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 8716013 8850620
## sample estimates:
## mean of x mean of y
## 8.783385e+06 6.829902e+01
# Between transfer_budget_eur and international prestige.
# Welch two-sample t-test on the difference between the two column means;
# the result is stored and then printed.
# NOTE(review): a difference-of-means test does not measure the relationship
# between budget and prestige; cor.test() or lm() is presumably intended —
# confirm.
t_test_int_prestige<- t.test(replace_rows$transfer_budget_eur, replace_rows$international_prestige)
print(t_test_int_prestige)##
## Welch Two Sample t-test
##
## data: replace_rows$transfer_budget_eur and replace_rows$international_prestige
## t = 255.78, df = 282671, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 8716078 8850685
## sample estimates:
## mean of x mean of y
## 8.783385e+06 2.671446e+00
# Between transfer_budget_eur and domestic prestige.
# Welch two-sample t-test on the difference between the two column means;
# the result is stored and then printed.
# NOTE(review): a difference-of-means test does not measure the relationship
# between budget and prestige; cor.test() or lm() is presumably intended —
# confirm.
t_test_domestic_prestige<- t.test(replace_rows$transfer_budget_eur, replace_rows$domestic_prestige)
print(t_test_domestic_prestige)##
## Welch Two Sample t-test
##
## data: replace_rows$transfer_budget_eur and replace_rows$domestic_prestige
## t = 255.78, df = 282671, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 8716076 8850683
## sample estimates:
## mean of x mean of y
## 8.783385e+06 5.497488e+00
# Between transfer_budget_eur and rival_team.
# Welch two-sample t-test on the difference between the two column means;
# the result is stored and then printed.
# NOTE(review): rival_team appears to hold team identifiers (its values look
# like team_id values), so its mean is presumably not meaningful — confirm
# whether this test should be kept.
t_test_rival_team<- t.test(replace_rows$transfer_budget_eur, replace_rows$rival_team)
print(t_test_rival_team)##
## Welch Two Sample t-test
##
## data: replace_rows$transfer_budget_eur and replace_rows$rival_team
## t = 254.73, df = 282675, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 8679807 8814414
## sample estimates:
## mean of x mean of y
## 8783384.65 36274.36
# Scatter plots of each team rating against the transfer budget. Each plot is
# overlaid with the least-squares line from lm(<rating> ~ transfer_budget_eur),
# so the line must be fitted on the same rating that is plotted on the y-axis.

# Scatter plot for Overall attribute with regression line
plot(replace_rows$transfer_budget_eur, replace_rows$overall,
     xlab = "Transfer Budget (EUR)", ylab = "Overall Rating",
     main = "Scatter Plot: Transfer Budget vs Overall Rating")
abline(lm(overall ~ transfer_budget_eur, data = replace_rows), col = "red")

# Scatter plot for Attack attribute with regression line
plot(replace_rows$transfer_budget_eur, replace_rows$attack,
     xlab = "Transfer Budget (EUR)", ylab = "Attack Rating",
     main = "Scatter Plot: Transfer Budget vs Attack Rating")
abline(lm(attack ~ transfer_budget_eur, data = replace_rows), col = "blue")

# Scatter plot for Midfield attribute with regression line
plot(replace_rows$transfer_budget_eur, replace_rows$midfield,
     xlab = "Transfer Budget (EUR)", ylab = "Midfield Rating",
     main = "Scatter Plot: Transfer Budget vs Midfield Rating")
abline(lm(midfield ~ transfer_budget_eur, data = replace_rows), col = "green")

# Scatter plot for Defence attribute with regression line
plot(replace_rows$transfer_budget_eur, replace_rows$defence,
     xlab = "Transfer Budget (EUR)", ylab = "Defence Rating",
     main = "Scatter Plot: Transfer Budget vs Defence Rating")
# The line is fitted on 'defence' (previously 'midfield', which drew the
# midfield regression line over the defence scatter plot).
abline(lm(defence ~ transfer_budget_eur, data = replace_rows), col = "yellow")