📊 Descriptive Statistics

# Keep only the numeric columns and summarise each one with psych::describe().
numeric_data <- uk_tourism %>%
  dplyr::select(where(is.numeric))
desc_stats <- psych::describe(numeric_data)

# Report a subset of the describe() columns, rounded to two decimals.
desc_rounded <- round(
  dplyr::select(desc_stats, mean, sd, min, max, skew, kurtosis),
  2
)

# Render as a styled HTML table; the final expression is what gets displayed.
desc_html <- knitr::kable(
  desc_rounded,
  "html",
  caption = "Descriptive Statistics of UK Tourism Data"
)
kableExtra::kable_styling(
  desc_html,
  bootstrap_options = c("striped", "hover", "condensed")
)
Descriptive Statistics of UK Tourism Data
mean sd min max skew kurtosis
Year 2016.50 4.18 2010.00 2023.00 0.00 -1.46
Total_Visitors 30.09 10.23 6.20 40.90 -1.17 0.18
Visitor_Expenditure 18.72 6.74 3.90 28.40 -0.93 -0.10
Expenditure_Per_Visitor 618.52 42.10 558.56 707.24 0.67 -0.31
Exchange_Rate_USD 1.43 0.14 1.24 1.65 0.21 -1.77
Exchange_Rate_EUR 1.19 0.07 1.13 1.38 1.65 2.26
Purpose_Holiday 11.66 4.06 2.00 16.00 -1.27 0.37
Purpose_Business 6.63 2.80 0.50 9.20 -1.10 -0.32
Purpose_VFR 9.42 2.61 3.30 12.70 -1.03 0.19
Purpose_Other 2.38 0.98 0.40 3.60 -0.56 -0.81
Brexit_Period 0.57 0.51 0.00 1.00 -0.26 -2.07
COVID_Period 0.14 0.36 0.00 1.00 1.83 1.45

🧪 Hypothesis 1: Visitor Expenditure ➜ Total Visitors

# Simple linear regression of total visitors on visitor expenditure, plus the
# correlation test between the same two columns.
model_h1 <- lm(Total_Visitors ~ Visitor_Expenditure, data = uk_tourism)
summary_h1 <- summary(model_h1)
cor_h1 <- cor.test(uk_tourism$Visitor_Expenditure, uk_tourism$Total_Visitors)

# Second row of the coefficient matrix is the Visitor_Expenditure term:
# element 1 is the estimate (slope), element 4 its p-value.
slope_row <- summary_h1$coefficients[2, ]

h1_values <- c(
  cor_h1$estimate,
  cor_h1$p.value,
  slope_row[[1]],
  slope_row[[4]],
  summary_h1$r.squared
)

# One row per metric, every value rounded to four decimals.
h1_table <- data.frame(
  Metric = c(
    "Correlation",
    "Correlation p-value",
    "Regression Slope",
    "Regression p-value",
    "R-squared"
  ),
  Value = round(h1_values, 4)
)
kable(h1_table, caption = "Hypothesis 1: Expenditure Impact Results")
Hypothesis 1: Expenditure Impact Results
Metric Value
Correlation 0.9814
Correlation p-value 0.0000
Regression Slope 1.4896
Regression p-value 0.0000
R-squared 0.9632
# Scatter plot of expenditure against total visitors with a fitted
# linear-model trend line drawn in blue.
ggplot(
  data = uk_tourism,
  mapping = aes(x = Visitor_Expenditure, y = Total_Visitors)
) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Visitor Expenditure vs Total Visitors")
## `geom_smooth()` using formula = 'y ~ x'


💱 Hypothesis 2: Exchange Rate ➜ Total Visitors

# Gather the five reported metrics for one exchange-rate series: correlation
# estimate and p-value, then the regression slope, its p-value and R-squared.
# All values are rounded to four decimals.
exchange_rate_metrics <- function(cor_result, fit) {
  fit_summary <- summary(fit)
  round(
    c(
      cor_result$estimate,
      cor_result$p.value,
      coef(fit)[2],
      fit_summary$coefficients[2, 4],
      fit_summary$r.squared
    ),
    4
  )
}

# Correlation tests of each exchange rate against total visitors.
cor_usd <- cor.test(uk_tourism$Exchange_Rate_USD, uk_tourism$Total_Visitors)
cor_eur <- cor.test(uk_tourism$Exchange_Rate_EUR, uk_tourism$Total_Visitors)

# Matching single-predictor regressions.
model_usd <- lm(Total_Visitors ~ Exchange_Rate_USD, data = uk_tourism)
model_eur <- lm(Total_Visitors ~ Exchange_Rate_EUR, data = uk_tourism)

# Row labels shared by both result tables.
h2_metric_labels <- c("Correlation", "p-value", "Slope", "Reg. p", "R-squared")

h2_usd <- data.frame(
  Metric = h2_metric_labels,
  USD = exchange_rate_metrics(cor_usd, model_usd)
)

h2_eur <- data.frame(
  Metric = h2_metric_labels,
  EUR = exchange_rate_metrics(cor_eur, model_eur)
)
kable(h2_usd, caption = "Hypothesis 2: USD Exchange Rate Results")
Hypothesis 2: USD Exchange Rate Results
Metric USD
Correlation 0.1194
p-value 0.6843
Slope 8.5452
Reg. p 0.6843
R-squared 0.0143
# Render the EUR exchange-rate results table.
kable(
  x = h2_eur,
  caption = "Hypothesis 2: EUR Exchange Rate Results"
)
Hypothesis 2: EUR Exchange Rate Results
Metric EUR
Correlation 0.2204
p-value 0.4489
Slope 33.8461
Reg. p 0.4489
R-squared 0.0486
# Reshape the two exchange-rate columns to long form (one row per year and
# currency) and plot both series over time, coloured by currency.
uk_tourism %>%
  pivot_longer(cols = c(Exchange_Rate_USD, Exchange_Rate_EUR),
               names_to = "Currency", values_to = "Rate") %>%
  ggplot(aes(x = Year, y = Rate, color = Currency)) +
  # `linewidth` replaces `size` for lines; `size` was deprecated for line
  # geoms in ggplot2 3.4.0 and triggers a lifecycle warning.
  geom_line(linewidth = 1) + geom_point() +
  labs(title = "Exchange Rates Over Time")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.


🎯 Hypothesis 3: Purpose of Visit ➜ Visitor Variation

# Stack the four purpose-of-visit columns into long form: one row per original
# row and purpose, with the "Purpose_" prefix removed from the purpose label.
purpose_data <- pivot_longer(
  uk_tourism,
  cols = c(Purpose_Holiday, Purpose_Business, Purpose_VFR, Purpose_Other),
  names_to = "Purpose",
  names_prefix = "Purpose_",
  values_to = "Visitors"
)

# One-way ANOVA of visitor counts across purposes, followed by Tukey HSD
# pairwise comparisons.
# NOTE(review): every original row contributes one observation to each purpose
# group, so the groups are not independent samples; confirm that a one-way
# ANOVA is the intended design.
anova_model <- aov(Visitors ~ Purpose, data = purpose_data)
tukey_result <- TukeyHSD(anova_model)

# summary() of an aov fit is a list; its first element is the ANOVA table.
anova_table <- summary(anova_model)[[1]]
kable(anova_table, caption = "ANOVA Results for Purpose of Visit")
ANOVA Results for Purpose of Visit
Df Sum Sq Mean Sq F value Pr(>F)
Purpose 3 672.2720 224.09065 27.90081 0
Residuals 52 417.6479 8.03169 NA NA
# Render the pairwise Tukey HSD comparisons for the Purpose factor.
kable(
  x = tukey_result[["Purpose"]],
  caption = "Tukey HSD Post-Hoc Test"
)
Tukey HSD Post-Hoc Test
diff lwr upr p adj
Holiday-Business 5.035714 2.1927492 7.8786794 0.0001118
Other-Business -4.250000 -7.0929651 -1.4070349 0.0012438
VFR-Business 2.792857 -0.0501079 5.6358222 0.0558871
Other-Holiday -9.285714 -12.1286794 -6.4427492 0.0000000
VFR-Holiday -2.242857 -5.0858222 0.6001079 0.1686217
VFR-Other 7.042857 4.1998921 9.8858222 0.0000001
# Box plot of visitor counts for each purpose of visit, one filled box per
# purpose.
ggplot(
  data = purpose_data,
  mapping = aes(x = Purpose, y = Visitors, fill = Purpose)
) +
  geom_boxplot() +
  labs(title = "Visitor Counts by Purpose of Visit")


🚧 Hypothesis 4: Travel Restrictions ➜ Visitor Drop

# Regress total visitors on the travel-restriction level.
model_h4 <- lm(Total_Visitors ~ Travel_Restrictions, data = uk_tourism)
summary_h4 <- summary(model_h4)

# Names of the coefficients the model actually estimated.
available_coefs <- rownames(summary_h4$coefficients)

# Report every estimated restriction term instead of a hard-coded list of
# level names. A hard-coded list shows NA for any level that has no
# coefficient (for a categorical predictor, the baseline level never gets one)
# and silently omits levels it does not name.
restriction_terms <- setdiff(available_coefs, "(Intercept)")

h4_table <- data.frame(
  # Strip the variable-name prefix so only the level label is shown.
  Coefficient = sub("^Travel_Restrictions", "", restriction_terms),
  # Estimates are in column 1 of the coefficient matrix; unname() keeps the
  # data frame's row names as the default integer sequence.
  Estimate = unname(round(summary_h4$coefficients[restriction_terms, 1], 4))
)

kable(h4_table, caption = "Hypothesis 4: Travel Restrictions Coefficients")
Hypothesis 4: Travel Restrictions Coefficients
Coefficient Estimate
Restricted 3.55
Banned NA
# Box plot of total visitors for each travel-restriction level, one filled box
# per level.
ggplot(
  data = uk_tourism,
  mapping = aes(
    x = Travel_Restrictions,
    y = Total_Visitors,
    fill = Travel_Restrictions
  )
) +
  geom_boxplot() +
  labs(title = "Total Visitors by Travel Restriction Level")