library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(magrittr)
## Warning: package 'magrittr' was built under R version 4.4.2
library(ggplot2)
library(corrplot)
## corrplot 0.94 loaded

============================

2. Load and Inspect the Dataset

============================

Load the CSV file

# Import the stadium records from a CSV file resolved against the working
# directory; text columns stay character vectors rather than factors.
football_stadiums <- read.csv(
  file = "Football Stadiums.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

Check the first few rows

# Quick sanity check: announce, then show the first six rows of the data.
print("First few rows of the dataset:")
## [1] "First few rows of the dataset:"
print(head(football_stadiums, n = 6L))
##   Confederation                  Stadium   City                    HomeTeams
## 1          UEFA      Stadiumi Besëlidhja  Lezhë                   Besëlidhja
## 2          UEFA      Stadiumi Flamurtari  Vlorë             Flamurtari Vlorë
## 3          UEFA            Stadiumi Laçi   Laçi                      KF Laçi
## 4          UEFA     Stadiumi Niko Dovana Durrës                        Teuta
## 5          UEFA Stadiumi Selman Stërmasi Tirana KF Tirana, Dinamo, Partizani
## 6          UEFA      Stadiumi Skënderbeu  Korçë                   Skënderbeu
##   Capacity Country IOC Population
## 1     7000 Albania ALB    2876591
## 2     8200 Albania ALB    2876591
## 3     5000 Albania ALB    2876591
## 4    12040 Albania ALB    2876591
## 5     9500 Albania ALB    2876591
## 6     7500 Albania ALB    2876591

Check column names

# List the column names so later steps can refer to them by name.
print("Column names in the dataset:")
## [1] "Column names in the dataset:"
print(names(football_stadiums))
## [1] "Confederation" "Stadium"       "City"          "HomeTeams"    
## [5] "Capacity"      "Country"       "IOC"           "Population"

Check for missing values

# Count NA entries per column: is.na() marks each cell, colSums() totals the
# marks column by column.
print("Number of missing values in each column:")
## [1] "Number of missing values in each column:"
print(colSums(x = is.na(football_stadiums)))
## Confederation       Stadium          City     HomeTeams      Capacity 
##             0             0             0             0             0 
##       Country           IOC    Population 
##             0             0             0

============================

3. Ensure Numeric Columns for Analysis

============================

Select numeric columns only

# Keep only the numeric columns (Capacity and Population in this dataset) for
# the summary, correlation and regression steps that follow.
# `select(where(...))` is the current replacement for the superseded
# `select_if()` scoped verb.
numeric_columns <- football_stadiums %>%
  select(where(is.numeric))

Check summary statistics for numeric columns

# Show min / quartiles / mean / max for each numeric column.
print("Summary statistics for numeric columns:")
## [1] "Summary statistics for numeric columns:"
print(summary(object = numeric_columns))
##     Capacity        Population       
##  Min.   :   244   Min.   :3.219e+04  
##  1st Qu.:  7478   1st Qu.:1.029e+07  
##  Median : 16408   Median :3.843e+07  
##  Mean   : 22905   Mean   :9.965e+07  
##  3rd Qu.: 32005   3rd Qu.:8.081e+07  
##  Max.   :153000   Max.   :1.404e+09

============================

4. Descriptive Analysis: Tables and Figures

============================

Example: Histogram of the first numeric column

# Histogram of the first numeric column.
# The column is looked up by name inside the plotted data frame through the
# `.data` pronoun instead of handing aes() a vector from a different object,
# so the plotted data and the aesthetic mapping cannot drift out of sync.
hist_column <- colnames(numeric_columns)[1]

# A fixed binwidth of 10 produces thousands of near-empty bins for a variable
# spanning several orders of magnitude (Capacity runs from 244 to 153000), so
# derive the width from the observed range to get roughly 30 bins.
hist_binwidth <- diff(range(numeric_columns[[hist_column]], na.rm = TRUE)) / 30
if (!is.finite(hist_binwidth) || hist_binwidth <= 0) {
  # Constant or empty column: fall back to a unit-wide bin so the plot renders.
  hist_binwidth <- 1
}

ggplot(numeric_columns, aes(x = .data[[hist_column]])) +
  geom_histogram(binwidth = hist_binwidth, fill = "blue", color = "black") +
  ggtitle("Histogram of First Numeric Column") +
  xlab(hist_column) +
  ylab("Count") +
  theme_minimal()

# Descriptive Analysis: This histogram shows the distribution of the first numeric column. Adjusting the ‘binwidth’ parameter can help visualize the spread more effectively.

============================

5. Correlation Analysis

============================

Calculate the correlation matrix

# Pairwise correlations between the numeric columns; rows with any missing
# value are excluded from the calculation (use = "complete.obs").
correlation_matrix <- cor(
  x = numeric_columns,
  use = "complete.obs"
)

Visualize the correlation matrix

# Print the raw coefficients before plotting them.
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
print(x = correlation_matrix)
##             Capacity Population
## Capacity   1.0000000  0.3211247
## Population 0.3211247  1.0000000

The ‘corrplot’ function provides a visual representation of the correlation matrix. The ‘color’ method is used to show the strength of correlations.

# Draw the upper triangle of the correlation matrix as coloured tiles with
# black labels rotated 45 degrees.
# NOTE(review): the non-zero `mar` entry presumably leaves room for the title
# so it is not clipped; confirm against the rendered plot.
corrplot(
  correlation_matrix,
  method = "color",
  type = "upper",
  tl.col = "black",
  tl.srt = 45,
  title = "Correlation Matrix",
  mar = c(0, 0, 1, 0)
)

# Correlation Analysis: This matrix and plot show the pairwise correlations between the numeric variables in the dataset. Stronger correlations are indicated by more intense colors, helping identify relationships between variables.

============================

6. Regression Analysis

============================

Example: Linear Regression Model

Assuming ‘Capacity’ is the dependent variable and all other numeric columns are predictors

# Fit a linear model of Capacity on every other numeric column and print its
# summary.
# The guard inspects `numeric_columns`, the data actually passed to lm(),
# rather than the raw data frame: a Capacity column that was read as
# non-numeric would exist in `football_stadiums` but be missing from
# `numeric_columns`, and lm() would then fail instead of reaching the
# else branch.
if ("Capacity" %in% colnames(numeric_columns)) {
  regression_model <- lm(Capacity ~ ., data = numeric_columns)
  print("Regression Analysis Summary:")
  print(summary(regression_model))
} else {
  print("The numeric column 'Capacity' does not exist in the dataset.")
}
## [1] "Regression Analysis Summary:"
## 
## Call:
## lm(formula = Capacity ~ ., data = numeric_columns)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -53790 -14168  -5755   9007 123626 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.005e+04  4.748e+02   42.23   <2e-16 ***
## Population  2.862e-05  1.877e-06   15.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19640 on 2022 degrees of freedom
## Multiple R-squared:  0.1031, Adjusted R-squared:  0.1027 
## F-statistic: 232.5 on 1 and 2022 DF,  p-value: < 2.2e-16

============================

7. Alternative Base R Approach (if %>% Fails)

============================

Base R summary without pipes

# Base R equivalent of the numeric-column summary, without pipes or dplyr.
print("Base R Summary for Numeric Columns:")
## [1] "Base R Summary for Numeric Columns:"
# vapply() always returns a logical vector (sapply() returns a list for a
# zero-column input), and drop = FALSE keeps the result a data frame even
# when only one column is numeric.
is_numeric_column <- vapply(football_stadiums, is.numeric, logical(1))
numeric_columns_base <- football_stadiums[, is_numeric_column, drop = FALSE]
print(summary(numeric_columns_base))
##     Capacity        Population       
##  Min.   :   244   Min.   :3.219e+04  
##  1st Qu.:  7478   1st Qu.:1.029e+07  
##  Median : 16408   Median :3.843e+07  
##  Mean   : 22905   Mean   :9.965e+07  
##  3rd Qu.: 32005   3rd Qu.:8.081e+07  
##  Max.   :153000   Max.   :1.404e+09