# ==============================================================================
# 1. Descriptive Statistics: Dependent Variable by Key Independent Variable
# ==============================================================================
library(dplyr)

# Summarise annual electricity consumption (Econs2012) within each Energy
# Efficiency Band (EE_BAND): one row per band holding the band's mean and
# median, with missing consumption values dropped before aggregating.
dv_by_iv_summary <- summarise(
  group_by(need_clean, EE_BAND),
  mean_econs = mean(Econs2012, na.rm = TRUE),
  median_econs = median(Econs2012, na.rm = TRUE)
)

# Print a heading, then the summary table (a first look at the Research
# Question).
print("Average and median electricity consumption by EPC Band:")
## [1] "Average and median electricity consumption by EPC Band:"
dv_by_iv_summary
## # A tibble: 6 × 3
##   EE_BAND mean_econs median_econs
##   <ord>        <dbl>        <dbl>
## 1 A/B          2375.         1900
## 2 C            3055.         2600
## 3 D            3511.         3050
## 4 E            3859.         3300
## 5 F            4382.         3650
## 6 G            4242.         3350
# ==============================================================================
# 2. Descriptive Statistics: Control Variables by Key Independent Variable
# ==============================================================================
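# We use cross-tabulations to examine the rival explanations.
# prop.table() with margin = 2 converts the counts to column proportions, so
# the values within each EE_BAND column add up to (approximately) 100%.
# The proportions are rounded to 4 decimal places and then multiplied by 100,
# so the output is shown as percentages with 2 decimal places.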

# --- Cross-tabulation 1: Property Type by EE_BAND ---
# Count PROP_TYPE (rows) against EE_BAND (columns), turn each column into
# proportions, round to 4 decimal places, then scale by 100 for percentages.
# local() keeps the intermediate tables out of the global environment.
type_by_band <- local({
  counts <- table(need_clean$PROP_TYPE, need_clean$EE_BAND)
  col_share <- prop.table(counts, margin = 2)
  round(col_share, digits = 4) * 100
})

print("Percentage of Property Types within each EPC Band:")
## [1] "Percentage of Property Types within each EPC Band:"
type_by_band
##                         
##                            A/B     C     D     E     F     G
##   Detached                3.41  9.35 15.36 15.28 26.03 18.78
##   Semi-detached           6.44 18.77 26.45 31.39 28.65 26.85
##   End terrace             3.78  8.94  9.56 13.34 14.74 14.95
##   Mid terrace            10.59 22.28 23.96 21.74 15.91 29.04
##   Bungalow                2.91  6.36 10.78  9.60  8.84  5.27
##   Flat (inc. maisonette) 72.86 34.30 13.90  8.65  5.82  5.10
# --- Cross-tabulation 2: Property Age by EE_BAND ---
# Count PROP_AGE (rows) against EE_BAND (columns), turn each column into
# proportions, round to 4 decimal places, then scale by 100 for percentages.
# local() keeps the intermediate tables out of the global environment.
age_by_band <- local({
  counts <- table(need_clean$PROP_AGE, need_clean$EE_BAND)
  col_share <- prop.table(counts, margin = 2)
  round(col_share, digits = 4) * 100
})

print("Percentage of Property Age categories within each EPC Band:")
## [1] "Percentage of Property Age categories within each EPC Band:"
age_by_band
##               
##                  A/B     C     D     E     F     G
##   Before 1930   1.99  8.95 25.76 45.64 60.60 62.04
##   1930–1949     2.15  9.32 17.38 22.90 21.29 22.45
##   1950–1966    10.44 19.52 19.64 16.18 11.84 11.48
##   1967–1982    19.87 24.38 20.25 12.68  5.85  3.44
##   1983–1995    12.04 15.23 12.23  2.42  0.32  0.46
##   1996 onwards 53.51 22.61  4.74  0.18  0.10  0.13
# --- Cross-tabulation 3: Wall Construction by EE_BAND ---
# Count WALL_CONS (rows) against EE_BAND (columns), turn each column into
# proportions, round to 4 decimal places, then scale by 100 for percentages.
# local() keeps the intermediate tables out of the global environment.
wall_by_band <- local({
  counts <- table(need_clean$WALL_CONS, need_clean$EE_BAND)
  col_share <- prop.table(counts, margin = 2)
  round(col_share, digits = 4) * 100
})

print("Percentage of Wall Construction types within each EPC Band:")
## [1] "Percentage of Wall Construction types within each EPC Band:"
wall_by_band
##              
##                 A/B     C     D     E     F     G
##   Cavity wall 77.67 81.32 66.65 43.96 30.45 29.85
##   Other       22.33 18.68 33.35 56.04 69.55 70.15
# Note: You can replicate the same syntax for other control variables 
# such as REGION or FLOOR_AREA_BAND if needed.