library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
## Warning: package 'magrittr' was built under R version 4.4.2
library(ggplot2)
library(corrplot)
## corrplot 0.94 loaded
# Load the stadium data from a CSV resolved against the working directory.
# The first row supplies the column names and text columns stay character
# vectors instead of being converted to factors.
football_stadiums <- read.csv(
  "Football Stadiums.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Preview the leading rows (head() defaults to six) to sanity-check the import.
print("First few rows of the dataset:")
## [1] "First few rows of the dataset:"
football_stadiums %>%
  head() %>%
  print()
## Confederation Stadium City HomeTeams
## 1 UEFA Stadiumi Besëlidhja Lezhë Besëlidhja
## 2 UEFA Stadiumi Flamurtari Vlorë Flamurtari Vlorë
## 3 UEFA Stadiumi Laçi Laçi KF Laçi
## 4 UEFA Stadiumi Niko Dovana Durrës Teuta
## 5 UEFA Stadiumi Selman Stërmasi Tirana KF Tirana, Dinamo, Partizani
## 6 UEFA Stadiumi Skënderbeu Korçë Skënderbeu
## Capacity Country IOC Population
## 1 7000 Albania ALB 2876591
## 2 8200 Albania ALB 2876591
## 3 5000 Albania ALB 2876591
## 4 12040 Albania ALB 2876591
## 5 9500 Albania ALB 2876591
## 6 7500 Albania ALB 2876591
# Show which columns the data frame provides.
print("Column names in the dataset:")
## [1] "Column names in the dataset:"
football_stadiums %>%
  colnames() %>%
  print()
## [1] "Confederation" "Stadium" "City" "HomeTeams"
## [5] "Capacity" "Country" "IOC" "Population"

# Count NA entries per column: is.na() yields a logical matrix and colSums()
# adds up the TRUE values column by column.
print("Number of missing values in each column:")
## [1] "Number of missing values in each column:"
football_stadiums %>%
  is.na() %>%
  colSums() %>%
  print()
## Confederation Stadium City HomeTeams Capacity
## 0 0 0 0 0
## Country IOC Population
## 0 0 0
# Keep only the numeric columns; the histogram, correlation and regression
# steps further down all read from this subset.
# `select(where(...))` replaces the superseded scoped verb `select_if()` and
# selects exactly the same columns.
numeric_columns <- football_stadiums %>%
  select(where(is.numeric))

# Five-number summary plus mean for each numeric column.
print("Summary statistics for numeric columns:")
## [1] "Summary statistics for numeric columns:"
print(summary(numeric_columns))
## Capacity Population
## Min. : 244 Min. :3.219e+04
## 1st Qu.: 7478 1st Qu.:1.029e+07
## Median : 16408 Median :3.843e+07
## Mean : 22905 Mean :9.965e+07
## 3rd Qu.: 32005 3rd Qu.:8.081e+07
## Max. :153000 Max. :1.404e+09
# Histogram of the first numeric column (for the data summarised in this file
# that is Capacity).
#
# The column is looked up by name through the `.data` pronoun, so the
# aesthetic comes from the data frame handed to ggplot() rather than from a
# separate vector that merely happens to have the same number of rows.
#
# `bins = 30` lets the bin width follow the range of the data. The previous
# fixed `binwidth = 10` split a variable spanning roughly 244 to 153000 into
# about fifteen thousand bars, which made the plot unreadable.
first_numeric_name <- colnames(numeric_columns)[1]
ggplot(football_stadiums, aes(x = .data[[first_numeric_name]])) +
  geom_histogram(bins = 30, fill = "blue", color = "black") +
  ggtitle("Histogram of First Numeric Column") +
  xlab(first_numeric_name) +
  ylab("Count") +
  theme_minimal()
# Descriptive Analysis: This is a histogram showing the distribution of
# the first numeric column. Adjusting the bin width (the `binwidth` or `bins`
# argument of geom_histogram()) can help visualize the spread more effectively.
# Pairwise correlations between the numeric columns. With
# use = "complete.obs", rows containing any NA are dropped before computing.
correlation_matrix <- numeric_columns %>%
  cor(use = "complete.obs")
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
print(correlation_matrix)
## Capacity Population
## Capacity 1.0000000 0.3211247
## Population 0.3211247 1.0000000

# Draw the upper triangle of the matrix as coloured tiles with black labels
# rotated 45 degrees. `mar` reserves one line of top margin, presumably so the
# title is not clipped.
corrplot(
  correlation_matrix,
  method = "color",
  type = "upper",
  tl.col = "black",
  tl.srt = 45,
  title = "Correlation Matrix",
  mar = c(0, 0, 1, 0)
)
# Correlation Analysis: This matrix and plot show the pairwise
# correlations between the numeric variables in the dataset. Stronger
# correlations are indicated by more intense colors, helping identify
# relationships between variables.
# Linear regression of Capacity on every other numeric column.
#
# The guard checks `numeric_columns`, the data frame actually passed to lm(),
# rather than the raw `football_stadiums`. If Capacity had been read in as
# text it would exist in the raw data yet be missing from the numeric subset,
# and the model call would fail despite the guard passing.
if ("Capacity" %in% colnames(numeric_columns)) {
  regression_model <- lm(Capacity ~ ., data = numeric_columns)
  print("Regression Analysis Summary:")
  print(summary(regression_model))
} else if ("Capacity" %in% colnames(football_stadiums)) {
  # Present in the raw data but filtered out of the numeric subset.
  print("The column 'Capacity' is not numeric, so it cannot be modelled.")
} else {
  print("The column 'Capacity' does not exist in the dataset.")
}
## [1] "Regression Analysis Summary:"
##
## Call:
## lm(formula = Capacity ~ ., data = numeric_columns)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53790 -14168 -5755 9007 123626
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.005e+04 4.748e+02 42.23 <2e-16 ***
## Population 2.862e-05 1.877e-06 15.25 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19640 on 2022 degrees of freedom
## Multiple R-squared: 0.1031, Adjusted R-squared: 0.1027
## F-statistic: 232.5 on 1 and 2022 DF, p-value: < 2.2e-16
# Base-R equivalent of the dplyr numeric-column summary.
print("Base R Summary for Numeric Columns:")
## [1] "Base R Summary for Numeric Columns:"
# vapply() always returns a logical vector (sapply() returns a list for a
# zero-column data frame), and drop = FALSE keeps the result a data frame even
# when exactly one column is numeric.
is_numeric_column <- vapply(football_stadiums, is.numeric, logical(1))
numeric_columns_base <- football_stadiums[, is_numeric_column, drop = FALSE]
print(summary(numeric_columns_base))
## Capacity Population
## Min. : 244 Min. :3.219e+04
## 1st Qu.: 7478 1st Qu.:1.029e+07
## Median : 16408 Median :3.843e+07
## Mean : 22905 Mean :9.965e+07
## 3rd Qu.: 32005 3rd Qu.:8.081e+07
## Max. :153000 Max. :1.404e+09