# Load necessary libraries
pacman::p_load(pacman, readr, tidyr, forcats, dplyr, ggplot2, patchwork, reshape2, scales, gridExtra)

# Function to load and clean datasets
load_and_clean_data <- function(file_path) {
  data <- read_csv(file_path) %>%
    filter(!is.na(country))
  return(data)
}

# Load the datasets from the CSV files
Companies_ranked_by_Revenue <- load_and_clean_data("Companies_ranked_by_Revenue.csv")
## Rows: 9912 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Name, Symbol, country
## dbl (3): Rank, revenue_ttm, price (GBP)
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Companies_ranked_by_Earnings <- load_and_clean_data("Companies_ranked_by_Earnings.csv")
## Rows: 9912 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Name, Symbol, country
## dbl (3): Rank, earnings_ttm, price (GBP)
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Companies_ranked_by_Market_Cap <- load_and_clean_data("Companies_ranked_by_Market_Cap.csv")
## Rows: 9912 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Name, Symbol, country
## dbl (3): Rank, marketcap, price (GBP)
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Companies_ranked_by_P_E_ratio <- load_and_clean_data("Companies_ranked_by_P_E_ratio.csv")
## Rows: 9912 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Name, Symbol, country
## dbl (3): Rank, pe_ratio_ttm, price (GBP)
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Companies_ranked_by_Dividend_Yield <- load_and_clean_data("Companies_ranked_by_Dividend_Yield.csv")
## Rows: 9912 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Name, Symbol, country
## dbl (3): Rank, dividend_yield_ttm, price (GBP)
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Merge all datasets based on common columns: Name, Symbol, and country
combined_data <- Companies_ranked_by_Revenue %>%
  inner_join(Companies_ranked_by_Earnings, by = c("Name", "Symbol", "country")) %>%
  inner_join(Companies_ranked_by_Market_Cap, by = c("Name", "Symbol", "country")) %>%
  inner_join(Companies_ranked_by_P_E_ratio, by = c("Name", "Symbol", "country")) %>%
  inner_join(Companies_ranked_by_Dividend_Yield, by = c("Name", "Symbol", "country")) %>%
  mutate(across(starts_with("ttm"), ~replace_na(., 0)))  # Replace NA with 0 for relevant columns

# Filter the top 15 companies based on revenue rank for focused analysis
top_15_companies <- combined_data %>%
  arrange(Rank.x) %>%
  slice(1:15)

# Summary of the numerical columns for overall analysis
summary_stats <- combined_data %>%
  summarize(
    total_companies = n(),
    avg_revenue = mean(revenue_ttm, na.rm = TRUE),
    avg_earnings = mean(earnings_ttm, na.rm = TRUE),
    avg_marketcap = mean(marketcap, na.rm = TRUE),
    avg_pe_ratio = mean(pe_ratio_ttm, na.rm = TRUE),
    avg_dividend_yield = mean(dividend_yield_ttm, na.rm = TRUE)
  )

# Print summary statistics to the console
print(summary_stats)
## # A tibble: 1 × 6
##   total_companies avg_revenue avg_earnings avg_marketcap avg_pe_ratio
##             <int>       <dbl>        <dbl>         <dbl>        <dbl>
## 1            9910 5707180612.   711244595.   8743628906.     -3.26e14
## # ℹ 1 more variable: avg_dividend_yield <dbl>
# Visualise Top 15 Companies by Revenue, Earnings, Market Cap, P/E Ratio, and Dividend Yield

# Bar plot: Top 15 Companies by Revenue
p1 <- ggplot(top_15_companies, aes(x = reorder(Name, -revenue_ttm), y = revenue_ttm / 1e9)) +
  geom_bar(stat = "identity", fill = "#3399ff") +
  labs(title = "Top 15 Companies by Revenue", x = "Company", y = "Revenue (£B)") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# Bar plot: Top 15 Companies by Earnings
p2 <- ggplot(top_15_companies, aes(x = reorder(Name, -earnings_ttm), y = earnings_ttm / 1e9)) +
  geom_bar(stat = "identity", fill = "#339900") +
  labs(title = "Top 15 Companies by Earnings", x = "Company", y = "Earnings (£B)") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# Bar plot: Top 15 Companies by Market Cap
p3 <- ggplot(top_15_companies, aes(x = reorder(Name, -marketcap), y = marketcap / 1e9)) +
  geom_bar(stat = "identity", fill = "#993300") +
  labs(title = "Top 15 Companies by Market Cap", x = "Company", y = "Market Cap (£B)") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# Bar plot: Top 15 Companies by P/E Ratio
p4 <- ggplot(top_15_companies, aes(x = reorder(Name, -pe_ratio_ttm), y = pe_ratio_ttm)) +
  geom_bar(stat = "identity", fill = "#cc6600") +
  labs(title = "Top 15 Companies by P/E Ratio", x = "Company", y = "P/E Ratio") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# Bar plot: Top 15 Companies by Dividend Yield
p5 <- ggplot(top_15_companies, aes(x = reorder(Name, -dividend_yield_ttm), y = dividend_yield_ttm)) +
  geom_bar(stat = "identity", fill = "#ff6699") +
  labs(title = "Top 15 Companies by Dividend Yield", x = "Company", y = "Dividend Yield (%)") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# Correlation Analysis: Heatmap
correlation_matrix <- cor(top_15_companies %>% select(revenue_ttm, earnings_ttm, marketcap, pe_ratio_ttm, dividend_yield_ttm), use = "complete.obs")
melted_correlation <- melt(correlation_matrix)

p6 <- ggplot(melted_correlation, aes(Var1, Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1, 1), space = "Lab", name="Correlation") +
  theme_minimal() +
  labs(title = "Correlation Heatmap of Financial Metrics") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))
  
# Arrange the plots in a grid layout
grid.arrange(p1, p2, p3, p4, p5, p6, nrow = 2, ncol = 3)

# Cross-Analysis: Scatter plots comparing the metrics

# Scatter plot: Revenue vs P/E Ratio
scatter1 <- ggplot(top_15_companies, aes(x = revenue_ttm / 1e9, y = pe_ratio_ttm, color = country)) +
  geom_point(size = 3) +
  labs(title = "Revenue vs P/E Ratio (Top 15 Companies)", x = "Revenue (£B)", y = "P/E Ratio") +
  theme()

# Scatter plot: Earnings vs Dividend Yield
scatter2 <- ggplot(top_15_companies, aes(x = earnings_ttm / 1e9, y = dividend_yield_ttm, color = country)) +
  geom_point(size = 3) +
  labs(title = "Earnings vs Dividend Yield (Top 15 Companies)", x = "Earnings (£B)", y = "Dividend Yield (%)") +
  theme()

# Scatter plot: Revenue vs Earnings
scatter3 <- ggplot(top_15_companies, aes(x = revenue_ttm / 1e9, y = earnings_ttm / 1e9, color = country)) +
  geom_point(size = 3) +
  labs(title = "Revenue vs Earnings (Top 15 Companies)", x = "Revenue (£B)", y = "Earnings (£B)") +
  theme()

# Scatter plot: Revenue vs Market Cap
scatter4 <- ggplot(top_15_companies, aes(x = revenue_ttm / 1e9, y = marketcap / 1e9, color = country)) +
  geom_point(size = 3) +
  labs(title = "Revenue vs Market Cap (Top 15 Companies)", x = "Revenue (£B)", y = "Market Cap (£B)") +
  theme()

# Arrange the 4 scatter plots into a 2x2 grid 
grid.arrange(scatter1, scatter2, scatter3, scatter4, nrow = 2)

# Country-Level Analysis: Average Revenue, P/E Ratio, and Dividend Yield by Country
country_summary <- combined_data %>%
  group_by(country) %>%
  summarize(
    avg_revenue = mean(revenue_ttm, na.rm = TRUE),
    avg_earnings = mean(earnings_ttm, na.rm = TRUE),
    avg_marketcap = mean(marketcap, na.rm = TRUE),
    avg_pe_ratio = mean(pe_ratio_ttm, na.rm = TRUE),
    avg_dividend_yield = mean(dividend_yield_ttm, na.rm = TRUE),
    company_count = n()
  ) %>%
  arrange(desc(avg_revenue))

# Bar Plot: Top 10 Countries by Average Dividend Yield
p7 <- ggplot(country_summary %>% top_n(10, avg_dividend_yield), aes(x = fct_reorder(country, avg_dividend_yield), y = avg_dividend_yield)) +
  geom_bar(stat = "identity", fill = "#ff6699") +
  labs(title = "Top 10 Countries by Average Dividend Yield", x = "Country", y = "Average Dividend Yield (%)") +
  theme() +
  coord_flip() +
  scale_y_continuous(labels = scales::comma)  

# Country-Level Analysis: Average Revenue, Earnings, and Market Cap by Country
country_summary <- combined_data %>%
  group_by(country) %>%
  summarize(
    avg_revenue = mean(revenue_ttm, na.rm = TRUE),
    avg_earnings = mean(earnings_ttm, na.rm = TRUE),
    avg_marketcap = mean(marketcap, na.rm = TRUE),
    company_count = n()
  ) %>%
  arrange(desc(avg_revenue))

# Bar Plot: Top 10 Countries by Average Revenue, Earnings, and Market Cap

# Top 10 Countries by Revenue
p8 <- ggplot(country_summary %>% top_n(10, avg_revenue), aes(x = fct_reorder(country, avg_revenue), y = avg_revenue / 1e9)) +
  geom_bar(stat = "identity", fill = "#3399ff") +
  labs(title = "Top 10 Countries by Average Revenue", x = "Country", y = "Average Revenue (£B)") +
  theme() +
  coord_flip()

# Top 10 Countries by Earnings
p9 <- ggplot(country_summary %>% top_n(10, avg_earnings), aes(x = fct_reorder(country, avg_earnings), y = avg_earnings / 1e9)) +
  geom_bar(stat = "identity", fill = "#339900") +
  labs(title = "Top 10 Countries by Average Earnings", x = "Country", y = "Average Earnings (£B)") +
  theme() +
  coord_flip()

# Top 10 Countries by Market Cap
p10 <- ggplot(country_summary %>% top_n(10, avg_marketcap), aes(x = fct_reorder(country, avg_marketcap), y = avg_marketcap / 1e9)) +
  geom_bar(stat = "identity", fill = "#993300") +
  labs(title = "Top 10 Countries by Average Market Cap", x = "Country", y = "Average Market Cap (£B)") +
  theme() +
  coord_flip()

# Arrange the plots in a grid layout
grid.arrange(p7, p8, p9, p10, nrow = 2, ncol = 2)

# Correlation Analysis: Revenue, Earnings, and Market Cap
correlation_revenue_earnings <- cor(top_15_companies$revenue_ttm, top_15_companies$earnings_ttm, use = "complete.obs")
correlation_revenue_marketcap <- cor(top_15_companies$revenue_ttm, top_15_companies$marketcap, use = "complete.obs")

print(paste("Correlation between Revenue and Earnings: ", round(correlation_revenue_earnings, 2)))
## [1] "Correlation between Revenue and Earnings:  0.17"
print(paste("Correlation between Revenue and Market Cap: ", round(correlation_revenue_marketcap, 2)))
## [1] "Correlation between Revenue and Market Cap:  0.25"
# Summarise total revenue by country and calculate percentages
revenue_by_country <- combined_data %>%
  group_by(country) %>%
  summarize(total_revenue = sum(revenue_ttm, na.rm = TRUE)) %>%
  arrange(desc(total_revenue)) %>%
  slice(1:10) %>%
  mutate(percentage = total_revenue / sum(total_revenue) * 100)

# Pie chart with percentage labels
p11 <- ggplot(revenue_by_country, aes(x = "", y = total_revenue, fill = country)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  labs(title = "Revenue Distribution by Country (Top 10)", y = "", x = "") +
  theme_void() +
  theme(legend.title = element_blank()) +
  scale_fill_brewer(palette = "Set3") +
  geom_text(aes(label = paste0(round(percentage, 1), "%")),
            position = position_stack(vjust = 0.5))  # Add percentage labels

# Calculate percentages for top 10 companies by market cap
top_10_marketcap <- combined_data %>%
  arrange(desc(marketcap)) %>%
  slice(1:10) %>%
  mutate(percentage = marketcap / sum(marketcap) * 100)

# Pie chart with percentage labels
p12 <- ggplot(top_10_marketcap, aes(x = "", y = marketcap, fill = Name)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  labs(title = "Market Capitalisation Distribution (Top 10 Companies)", y = "", x = "") +
  theme_void() +
  theme(legend.title = element_blank()) +
  scale_fill_brewer(palette = "Set3") +
  geom_text(aes(label = paste0(round(percentage, 1), "%")),
            position = position_stack(vjust = 0.5))  # Add percentage labels

# Arrange the plots in a grid layout
grid.arrange(p11, p12, ncol = 2)