mydata <- read.table("AllTimeRankingByCountry.csv",
header = TRUE,
sep = ",",
dec = ".")
colnames(mydata)[colnames(mydata) == "Goals.For"] <- "GoalsFor"
colnames(mydata)[colnames(mydata) == "Goals.Against"] <- "GoalsAgainst"
colnames(mydata)[colnames(mydata) == "Goal.Diff"] <- "GoalDiff"
mydata$PositiveGD <- factor(ifelse(mydata$GoalDiff > 0, 1, 0),
levels = c(0, 1),
labels = c("Negative", "Positive"))
champions <- subset(mydata, Titles >= 5)
Explanation: - The original variable names contained dots, such as Goals.For, Goals.Against, and Goal.Diff. - They were renamed to GoalsFor, GoalsAgainst, and GoalDiff.
Explanation: - I created a new factor variable, PositiveGD, that classifies countries into two groups based on their goal difference: "Positive" if GoalDiff is greater than 0, "Negative" otherwise.
Explanation: - I created a new data frame called champions, containing the countries with 5 or more titles.
head(mydata)
## X Country Participated Titles Played Win Draw Loss GoalsFor GoalsAgainst Pts GoalDiff PositiveGD
## 1 1 Spain 148 19 1349 705 306 338 2427 1446 1716 981 Positive
## 2 2 England 136 14 1239 655 271 313 2218 1266 1581 952 Positive
## 3 3 Germany 166 8 1176 554 242 380 2070 1538 1350 532 Positive
## 4 4 Italy 138 12 1086 508 278 300 1662 1176 1294 486 Positive
## 5 5 France 115 1 794 331 175 288 1187 981 837 206 Positive
## 6 6 Portugal 106 4 680 280 156 244 1006 857 716 149 Positive
Explanation: - Country: the name of the country (categorical variable). - Participated: number of times the country has participated (numeric). - Titles: number of titles (championships) won (numeric). - Played: total number of games played (numeric). - Win: number of games won (numeric). - Draw: number of games drawn (numeric). - Loss: number of games lost (numeric). - GoalsFor: number of goals scored by the country (numeric). - GoalsAgainst: number of goals conceded by the country (numeric). - Pts: total points achieved (numeric). - GoalDiff: goal difference (GoalsFor − GoalsAgainst) (numeric).
Unit = one country. Sample = the 54 countries in the dataset.
summary(mydata[, -1])
## Country Participated Titles Played Win Draw Loss
## Length:54 Min. : 5.00 Min. : 0.000 Min. : 15.0 Min. : 1.0 Min. : 1.00 Min. : 7.0
## Class :character 1st Qu.: 27.00 1st Qu.: 0.000 1st Qu.: 79.5 1st Qu.: 20.5 1st Qu.: 16.25 1st Qu.: 42.5
## Mode :character Median : 58.50 Median : 0.000 Median : 170.5 Median : 55.0 Median : 38.00 Median : 97.5
## Mean : 57.63 Mean : 1.241 Mean : 289.3 Mean :113.2 Mean : 62.85 Mean :113.2
## 3rd Qu.: 73.75 3rd Qu.: 0.000 3rd Qu.: 336.5 3rd Qu.:126.5 3rd Qu.: 75.25 3rd Qu.:148.5
## Max. :166.00 Max. :19.000 Max. :1349.0 Max. :705.0 Max. :306.00 Max. :380.0
## GoalsFor GoalsAgainst Pts GoalDiff PositiveGD
## Min. : 9.0 Min. : 23.0 Min. : 5.00 Min. :-356.00 Negative:42
## 1st Qu.: 84.0 1st Qu.: 130.2 1st Qu.: 57.25 1st Qu.:-100.75 Positive:12
## Median : 196.0 Median : 357.5 Median : 142.50 Median : -44.50
## Mean : 410.1 Mean : 410.1 Mean : 289.26 Mean : 0.00
## 3rd Qu.: 468.5 3rd Qu.: 535.0 3rd Qu.: 318.50 3rd Qu.: -5.25
## Max. :2427.0 Max. :1538.0 Max. :1716.00 Max. : 981.00
mydata <- na.omit(mydata)
mydata_highpoints <- subset(mydata, Pts >= 500)
head(mydata_highpoints)
## X Country Participated Titles Played Win Draw Loss GoalsFor GoalsAgainst Pts GoalDiff PositiveGD
## 1 1 Spain 148 19 1349 705 306 338 2427 1446 1716 981 Positive
## 2 2 England 136 14 1239 655 271 313 2218 1266 1581 952 Positive
## 3 3 Germany 166 8 1176 554 242 380 2070 1538 1350 532 Positive
## 4 4 Italy 138 12 1086 508 278 300 1662 1176 1294 486 Positive
## 5 5 France 115 1 794 331 175 288 1187 981 837 206 Positive
## 6 6 Portugal 106 4 680 280 156 244 1006 857 716 149 Positive
library(psych)   # provides describe() and describeBy()
describe(mydata[, -1])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Country* 1 54 27.50 15.73 27.5 27.50 20.02 1 54 53 0.00 -1.27 2.14
## Participated 2 54 57.63 36.80 58.5 53.43 43.74 5 166 161 0.93 0.49 5.01
## Titles 3 54 1.24 3.73 0.0 0.18 0.00 0 19 19 3.31 10.58 0.51
## Played 4 54 289.26 313.87 170.5 223.80 154.19 15 1349 1334 1.93 3.20 42.71
## Win 5 54 113.20 158.58 55.0 75.84 65.98 1 705 704 2.36 5.09 21.58
## Draw 6 54 62.85 72.42 38.0 47.80 36.32 1 306 305 1.91 3.05 9.85
## Loss 7 54 113.20 88.73 97.5 100.16 80.80 7 380 373 1.19 0.81 12.07
## GoalsFor 8 54 410.13 549.02 196.0 283.75 239.44 9 2427 2418 2.29 4.78 74.71
## GoalsAgainst 9 54 410.13 352.13 357.5 351.70 315.05 23 1538 1515 1.45 1.82 47.92
## Pts 10 54 289.26 388.34 142.5 199.48 160.12 5 1716 1711 2.28 4.68 52.85
## GoalDiff 11 54 0.00 241.81 -44.5 -41.95 75.61 -356 981 1337 2.56 7.41 32.91
## PositiveGD* 12 54 1.22 0.42 1.0 1.16 0.00 1 2 1 1.30 -0.32 0.06
mean(mydata$Pts)
## [1] 289.2593
median(mydata$Titles)
## [1] 0
sd(mydata$Win)
## [1] 158.5809
min(mydata$Draw)
## [1] 1
max(mydata$Loss)
## [1] 380
describeBy(x = mydata$Pts,
group = mydata$HighPoints)
## Warning in describeBy(x = mydata$Pts, group = mydata$HighPoints): no grouping variable requested
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 54 289.26 388.34 142.5 199.48 160.12 5 1716 1711 2.28 4.68 52.85
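Note: mydata contains no HighPoints column (only the separate data frame mydata_highpoints was created), so describeBy() finds no grouping variable, prints the warning above, and falls back to ungrouped statistics. A corrected call, assuming the goal is to compare points across the PositiveGD groups created earlier, would be:
# Group the summary statistics by the PositiveGD factor, which does exist in mydata
describeBy(x = mydata$Pts,
group = mydata$PositiveGD)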
hist(mydata$Pts,
ylab = "Number of countries",
xlab = "Points",
main = "Distribution of points",
col = "skyblue",
border = "black")
Explanation: The histogram shows that most countries have relatively few points, while only a small number of countries achieve very high totals. This indicates that football performance is dominated by a few strong countries.
boxplot(mydata$GoalsFor,
ylab = "Goals scored",
main = "Boxplot of goals scored by countries",
col = "lightblue")
Explanation: The boxplot shows the distribution of goals scored by the different countries. We see the median and interquartile range, but also some extreme values (outliers). This suggests that while most countries have scored relatively few goals, a few countries have scored many.
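To see which countries these outliers correspond to, a minimal sketch using the values flagged by the standard boxplot rule (points beyond 1.5 × IQR from the quartiles):
# Outlier values flagged by the boxplot, and the countries they belong to
out_vals <- boxplot.stats(mydata$GoalsFor)$out
mydata$Country[mydata$GoalsFor %in% out_vals]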
plot(mydata$GoalsFor, mydata$GoalsAgainst,
xlab = "Goals For",
ylab = "Goals Against",
main = "Scatterplot of Goals For vs. Goals Against",
pch = 19, col = "darkgreen")
Explanation: The scatterplot shows the relationship between goals scored and goals conceded. Countries that score more goals also tend to concede more goals, which makes sense because stronger teams usually play more matches. This visualization highlights differences in team strategies: some teams are very offensive (high GoalsFor), while others are more defensive.
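The strength of this relationship can be quantified with the Pearson correlation coefficient; a minimal check:
# Correlation between goals scored and goals conceded
cor(mydata$GoalsFor, mydata$GoalsAgainst)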
library(readxl)
mydata <- read_xlsx("./Business School.xlsx")
mydata <- as.data.frame(mydata)
head(mydata)
## Student ID Undergrad Degree Undergrad Grade MBA Grade Work Experience Employability (Before) Employability (After)
## 1 1 Business 68.4 90.2 No 252 276
## 2 2 Computer Science 70.2 68.7 Yes 101 119
## 3 3 Finance 76.4 83.3 No 401 462
## 4 4 Business 82.6 88.7 No 287 342
## 5 5 Finance 76.9 75.4 No 275 347
## 6 6 Computer Science 83.3 82.1 No 254 313
## Status Annual Salary
## 1 Placed 111000
## 2 Placed 107000
## 3 Placed 109000
## 4 Placed 148000
## 5 Placed 255500
## 6 Placed 103500
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
ggplot(mydata, aes(x = `Undergrad Degree`)) +
geom_bar(fill = "skyblue", colour = "black") +
ylab("Number of students") +
xlab("Undergraduate Degree") +
ggtitle("Distribution of Undergraduate Degrees")
Explanation: The most common undergraduate degree is Business. We can also show the exact distribution of degrees with a frequency table in the next step:
table(mydata$`Undergrad Degree`)
##
## Art Business Computer Science Engineering Finance
## 6 35 25 9 25
Explanation: - In my opinion, a frequency table is more suitable for this kind of categorical data, since it shows the exact counts behind the bar chart.
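To rank the degrees from most to least common directly, one small variation on the same table:
# Frequency table sorted in decreasing order
sort(table(mydata$`Undergrad Degree`), decreasing = TRUE)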
names(mydata) <- make.names(names(mydata))
mydata$Annual.Salary <- as.numeric(gsub(",", "", mydata$Annual.Salary))
mydata <- na.omit(mydata)
summary(mydata$Annual.Salary)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20000 87125 103500 109058 124000 340000
mean(mydata$Annual.Salary)
## [1] 109058
sd(mydata$Annual.Salary)
## [1] 41501.49
Explanation: - First, I changed "Annual Salary" to "Annual.Salary" with names(mydata) <- make.names(names(mydata)). - Some salary values are stored as text like "120,000"; R treats these as character, not numeric. gsub(",", "", ...) removes the commas, turning "120,000" into "120000", which as.numeric() then converts to a number.
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
##
## alpha, rescale
ggplot(mydata, aes(x = Annual.Salary)) +
geom_histogram(binwidth = 5000, fill = "lightgreen", colour = "black") +
ylab("Number of students") +
xlab("Annual Salary") +
ggtitle("Distribution of Annual Salary") +
scale_x_continuous(labels = comma)
Explanation: - library(scales) is needed for the comma function. - scale_x_continuous(labels = comma) formats the x-axis values with thousands separators (e.g., 100000 → 100,000). - Most salaries are clustered between 80,000 and 120,000. - The distribution is right-skewed: a few students earn much higher salaries (up to 340,000).
t.test(mydata$MBA.Grade, mu = 74)
##
## One Sample t-test
##
## data: mydata$MBA.Grade
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 74.51764 77.56346
## sample estimates:
## mean of x
## 76.04055
Explanation: - Null hypothesis: H_0: mu = 74 (the average MBA grade is equal to 74). - Alternative hypothesis: H_1: mu ≠ 74. - The sample mean MBA grade is 76.04, which is higher than 74. - The t-test gives p-value = 0.009 < 0.05, so we reject H_0.
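As a sanity check, the t statistic can be reproduced from its definition, t = (x̄ − μ₀) / (s / √n); a minimal sketch:
# Manual one-sample t statistic
xbar <- mean(mydata$MBA.Grade)
s <- sd(mydata$MBA.Grade)
n <- length(mydata$MBA.Grade)
(xbar - 74) / (s / sqrt(n))   # should match t = 2.6587 above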
library(effectsize)
##
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
##
## phi
cohens_d(mydata$MBA.Grade, mu = 74)
## Cohen's d | 95% CI
## ------------------------
## 0.27 | [0.07, 0.46]
##
## - Deviation from a difference of 74.
Explanation: - The effect size is small (Cohen’s d ≈ 0.27), meaning that although the difference is statistically significant, it is small in practical terms.
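For a one-sample test, Cohen’s d is simply the mean difference expressed in standard-deviation units, d = (x̄ − μ₀) / s; a minimal check:
# Manual one-sample Cohen's d
(mean(mydata$MBA.Grade) - 74) / sd(mydata$MBA.Grade)   # ≈ 0.27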
library(readxl)
apartments <- read_excel("Apartments.xlsx")
apartments <- na.omit(apartments)
head(apartments)
## # A tibble: 6 × 5
## Age Distance Price Parking Balcony
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Explanation: - I also removed rows with missing values at the start, using na.omit().
apartments$Parking <- factor(apartments$Parking,
levels = c(0, 1),
labels = c("No", "Yes"))
apartments$Balcony <- factor(apartments$Balcony,
levels = c(0, 1),
labels = c("No", "Yes"))
str(apartments)
## tibble [85 × 5] (S3: tbl_df/tbl/data.frame)
## $ Age : num [1:85] 7 18 7 28 18 28 14 18 22 25 ...
## $ Distance: num [1:85] 28 1 28 29 18 12 20 6 7 2 ...
## $ Price : num [1:85] 1640 2800 1660 1850 1640 1770 1850 1970 2270 2570 ...
## $ Parking : Factor w/ 2 levels "No","Yes": 1 2 1 1 2 1 1 2 2 2 ...
## $ Balcony : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 2 2 1 1 ...
Explanation: - I converted the categorical variables Parking and Balcony from numeric coding (0 = No, 1 = Yes) into factors with descriptive labels. I used the factor() function. After this transformation, both variables are shown as factors in the dataset instead of numbers.
t.test(apartments$Price, mu = 1900)
##
## One Sample t-test
##
## data: apartments$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
fit1 <- lm(Price ~ Age, data = apartments)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
r_fit1 <- sign(coef(fit1)["Age"]) * sqrt(summary(fit1)$r.squared)
r_fit1
## Age
## -0.230255
Explanation: I estimated the simple linear regression Price = f(Age) and saved it as fit1. The slope for Age is −8.98, which means the apartment price per m² decreases by about 9 EUR for each additional year of age. The intercept (2185.46) is the predicted price for a new apartment (Age = 0). The coefficient of determination is R² = 0.053, so 5.3% of the variability in price is explained by age. The coefficient of correlation is r ≈ −0.23, indicating a weak negative relationship: older apartments are slightly cheaper on average. Although the Age effect is statistically significant (p = 0.034), its practical impact is small given the low R².
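To illustrate what the fitted line implies in practice, a short sketch (the 20-year age is just an example value):
# Predicted price per m² for a hypothetical 20-year-old apartment,
# i.e. 2185.455 - 8.975 * 20 ≈ 2006
predict(fit1, newdata = data.frame(Age = 20))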
pairs(apartments[, c("Price", "Age", "Distance")],
main = "Scatterplot Matrix: Price, Age, Distance",
pch = 19, col = "blue")
Explanation: - Price vs Age: there is a slight negative trend (older apartments tend to have lower prices). - Price vs Distance: prices are slightly lower for apartments farther from the city center, but the relationship is not very strong. - Age vs Distance: the points are scattered randomly, without a clear pattern → this suggests that Age and Distance are not highly correlated.
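These visual impressions can be quantified with a correlation matrix; a minimal sketch:
# Pairwise Pearson correlations between the three numeric variables
cor(apartments[, c("Price", "Age", "Distance")])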
fit2 <- lm(Price ~ Age + Distance, data = apartments)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
Explanation: - Older apartments and apartments farther from the city center tend to have lower prices per m². Distance has a stronger effect than Age. The model explains about 44% of the variation in price, which is good but leaves room for other factors to play a role.
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
vif(fit2)
## Age Distance
## 1.001845 1.001845
Explanation: Both predictors (Age and Distance) have VIF values very close to 1. This means there is no multicollinearity problem. Age and Distance are not strongly correlated with each other. Therefore, both variables can stay in the regression model without distorting the results.
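The VIF of a predictor equals 1 / (1 − R²) from regressing that predictor on the remaining ones; a quick manual check for Age:
# VIF by hand: regress Age on Distance, then compute 1 / (1 - R^2)
r2_age <- summary(lm(Age ~ Distance, data = apartments))$r.squared
1 / (1 - r2_age)   # should match vif(fit2) for Age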
# Standardized residuals
std_resid <- rstandard(fit2)
# Cook's distance
cooks_d <- cooks.distance(fit2)
# Combine into a data frame for inspection
diagnostics <- data.frame(std_resid, cooks_d)
# Show the first few rows
head(diagnostics)
## std_resid cooks_d
## 1 -0.6653487 0.007386569
## 2 1.7832876 0.030365432
## 3 -0.5937629 0.005882612
## 4 0.7543794 0.008299153
## 5 -1.0733987 0.005112584
## 6 -0.7775190 0.004900891
# Rule of thumb for outliers:
# Standardized residuals > |3| are potential outliers
which(abs(std_resid) > 3)
## named integer(0)
# Rule of thumb for Cook's distance:
# Values > 4/n (where n = number of observations) may indicate high influence
n <- nrow(apartments)
which(cooks_d > 4/n)
## 22 33 38 53 55
## 22 33 38 53 55
# Standardized residuals
std_resid <- rstandard(fit2)
# Standardized fitted values
std_fitted <- scale(fitted(fit2))
# Scatterplot of standardized residuals vs standardized fitted values
plot(std_fitted, std_resid,
xlab = "Standardized Fitted Values",
ylab = "Standardized Residuals",
main = "Residuals vs Fitted Values",
pch = 19, col = "blue")
# Add a horizontal line at 0
abline(h = 0, col = "red", lwd = 2)
Explanation: - The plot does not show strong evidence of heteroskedasticity. Residuals are reasonably spread around 0 with no clear pattern, so the homoskedasticity assumption seems acceptable.
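A formal test can back up the visual check; one option, using the car package loaded above, is its score test for non-constant error variance:
# Breusch-Pagan-type score test: H0 = constant error variance
ncvTest(fit2)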
std_resid <- rstandard(fit2)
hist(std_resid,
main = "Histogram of Standardized Residuals",
xlab = "Standardized Residuals",
col = "lightblue", border = "black")
# Q-Q plot
qqnorm(std_resid, main = "Q-Q Plot of Standardized Residuals")
qqline(std_resid, col = "red", lwd = 2)
# Formal test
shapiro.test(std_resid)
##
## Shapiro-Wilk normality test
##
## data: std_resid
## W = 0.95306, p-value = 0.00366
Explanation - I tested whether the standardized residuals of model fit2 are normally distributed. - Q-Q Plot: The points mostly follow the straight red line, but there are deviations in the tails, suggesting the residuals are not perfectly normal. - Shapiro-Wilk Test: The p-value is 0.0037 (< 0.05), so I reject the null hypothesis of normality. This means the residuals are not normally distributed.
# Identify influential observations (Cook’s D > 4/n)
n <- nrow(apartments)
cutoff <- 4/n
influential <- which(cooks_d > cutoff)
# Remove these units and re-estimate the model
apartments_clean <- apartments[-influential, ]
fit2_clean <- lm(Price ~ Age + Distance, data = apartments_clean)
summary(fit2_clean)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = apartments_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.50 -203.69 -45.24 191.11 492.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2502.467 75.024 33.356 < 2e-16 ***
## Age -8.674 3.221 -2.693 0.00869 **
## Distance -24.063 2.692 -8.939 1.57e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared: 0.5361, Adjusted R-squared: 0.524
## F-statistic: 44.49 on 2 and 77 DF, p-value: 1.437e-13
Explanation: After removing the five influential observations (Cook’s distance > 4/n), the fit improves noticeably: R² rises from 0.44 to 0.54 and the residual standard error drops from 286.3 to 256.8. Both coefficients keep their negative signs and remain statistically significant.
fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = apartments_clean)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = apartments_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -390.93 -198.19 -53.64 186.73 518.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2393.316 93.930 25.480 < 2e-16 ***
## Age -7.970 3.191 -2.498 0.0147 *
## Distance -21.961 2.830 -7.762 3.39e-11 ***
## ParkingYes 128.700 60.801 2.117 0.0376 *
## BalconyYes 6.032 57.307 0.105 0.9165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared: 0.5623, Adjusted R-squared: 0.5389
## F-statistic: 24.08 on 4 and 75 DF, p-value: 7.764e-13
anova(fit2_clean, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking + Balcony
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 77 5077362
## 2 75 4791128 2 286234 2.2403 0.1135
Explanation: The ANOVA checks whether the additional predictors (Parking, Balcony) jointly reduce the residual variance. Since p = 0.1135 > 0.05, we cannot reject H0: adding Parking and Balcony together does not significantly improve the model over fit2_clean.
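The F statistic in the table can be reproduced from the two residual sums of squares printed above; a quick check:
# F = ((RSS1 - RSS2) / df_difference) / (RSS2 / df2)
((5077362 - 4791128) / 2) / (4791128 / 75)   # = 2.2403, as reported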
# Hypotheses for Parking
# H0: β_Parking = 0 (Having parking does not affect price per m²)
# H1: β_Parking ≠ 0 (Having parking affects price per m²)
# Hypotheses for Balcony
# H0: β_Balcony = 0 (Having a balcony does not affect price per m²)
# H1: β_Balcony ≠ 0 (Having a balcony affects price per m²)
# Overall regression test (F-statistic at bottom of summary)
# H0: β_Age = β_Distance = β_Parking = β_Balcony = 0
# (None of the predictors explain price per m²)
# H1: At least one β ≠ 0
# (At least one predictor significantly explains price per m²)
# ANOVA comparison of models
# H0: Adding Parking and Balcony does NOT improve the model (fit2 = fit3)
# H1: Adding Parking and Balcony DOES improve the model (fit3 better than fit2)
Explanation: - In fit3, I included Age, Distance, Parking, and Balcony. Parking has a significant positive effect (+128.7 €/m²), while Balcony’s effect is small and not significant (+6 €/m²). The F-statistic at the bottom of the output tests whether all predictors together improve the model compared to a model with no predictors. Since p < 0.001, the model is statistically significant overall.
# Save fitted values and residuals from fit3
apartments_clean$fitted <- fitted(fit3)
apartments_clean$residuals <- residuals(fit3)
# Look at apartment with ID = 2
apartments_clean[2, c("Price", "fitted", "residuals")]
## # A tibble: 1 × 3
## Price fitted residuals
## <dbl> <dbl> <dbl>
## 1 2800 2357. 443.
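As a final sanity check, the fitted value for this apartment can be rebuilt from the fit3 coefficients; its predictor values (Age = 18, Distance = 1, Parking = Yes, Balcony = No) come from the rows shown earlier:
# Fitted value by hand: intercept + (-7.970)*18 + (-21.961)*1 + 128.700*1 + 6.032*0
sum(coef(fit3) * c(1, 18, 1, 1, 0))   # ≈ 2357
# The residual is the observed minus the fitted price: 2800 - 2357 ≈ 443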