# Install skimr only when it is not already available, so that re-running the
# script does not reinstall the package (and hit the network) every time.
if (!requireNamespace("skimr", quietly = TRUE)) {
  install.packages("skimr")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2) 
library(skimr)

# Read the dataset (path is relative to the working directory)
my_data <- utils::read.csv(file = "ED.csv")

# Per-column summary: quartiles for numeric columns, length/class for text
summary(object = my_data)
##    COUNTRY            Country               WEEK        Week.number   
##  Length:135628      Length:135628      Min.   : 1.00   Min.   : 1.00  
##  Class :character   Class :character   1st Qu.:13.00   1st Qu.:13.00  
##  Mode  :character   Mode  :character   Median :26.00   Median :26.00  
##                                        Mean   :25.78   Mean   :25.78  
##                                        3rd Qu.:38.00   3rd Qu.:38.00  
##                                        Max.   :53.00   Max.   :53.00  
##     GENDER             Gender              AGE                Age           
##  Length:135628      Length:135628      Length:135628      Length:135628     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    VARIABLE           Variable              YEAR           Year     
##  Length:135628      Length:135628      Min.   :2020   Min.   :2020  
##  Class :character   Class :character   1st Qu.:2020   1st Qu.:2020  
##  Mode  :character   Mode  :character   Median :2021   Median :2021  
##                                        Mean   :2021   Mean   :2021  
##                                        3rd Qu.:2022   3rd Qu.:2022  
##                                        Max.   :2023   Max.   :2023  
##      Value           Flag.Codes           Flags          
##  Min.   :-12788.8   Length:135628      Length:135628     
##  1st Qu.:    -4.6   Class :character   Class :character  
##  Median :     6.2   Mode  :character   Mode  :character  
##  Mean   :   116.9                                        
##  3rd Qu.:    29.4                                        
##  Max.   : 28262.9
# Richer per-type summary (missingness, uniqueness, quantiles) via skimr
skimr::skim(my_data)
## Data summary
## Name my_data
## Number of rows 135628
## Number of columns 15
## _______________________
## Column type frequency:
## character 10
## numeric 5
## ________________________
## Group variables None
##
## Variable type: character
##
## skim_variable n_missing complete_rate min max empty n_unique whitespace
## COUNTRY 0 1 3 3 0 34 0
## Country 0 1 5 15 0 34 0
## GENDER 0 1 4 6 0 3 0
## Gender 0 1 5 7 0 3 0
## AGE 0 1 5 6 0 4 0
## Age 0 1 5 11 0 4 0
## VARIABLE 0 1 8 8 0 2 0
## Variable 0 1 22 37 0 2 0
## Flag.Codes 0 1 0 1 134938 2 0
## Flags 0 1 0 25 134938 2 0
##
## Variable type: numeric
##
## skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
## WEEK 0 1 25.78 14.75 1.0 13.0 26.0 38.0 53.0 ▇▇▇▇▆
## Week.number 0 1 25.78 14.75 1.0 13.0 26.0 38.0 53.0 ▇▇▇▇▆
## YEAR 0 1 2021.41 1.10 2020.0 2020.0 2021.0 2022.0 2023.0 ▇▇▁▇▆
## Year 0 1 2021.41 1.10 2020.0 2020.0 2021.0 2022.0 2023.0 ▇▇▁▇▆
## Value 0 1 116.92 766.43 -12788.8 -4.6 6.2 29.4 28262.9 ▁▇▁▁▁
# Missing Values
# Build the NA mask once and reuse it for both the overall and the per-column
# counts instead of scanning the whole data frame twice.
na_mask <- is.na(my_data)

# NA is not the only way a value can be absent here: read.csv() keeps blank
# text fields as "" rather than NA, so character columns can be mostly empty
# while the NA counts below still report zero. Count blanks per column too.
colSums(my_data == "", na.rm = TRUE)

# Total number of NA cells in the data frame
sum(na_mask)
## [1] 0
# Number of NA cells per column
colSums(na_mask)
##     COUNTRY     Country        WEEK Week.number      GENDER      Gender 
##           0           0           0           0           0           0 
##         AGE         Age    VARIABLE    Variable        YEAR        Year 
##           0           0           0           0           0           0 
##       Value  Flag.Codes       Flags 
##           0           0           0
# Compact structure listing: dimensions, column types and leading values
utils::str(object = my_data)
## 'data.frame':    135628 obs. of  15 variables:
##  $ COUNTRY    : chr  "CZE" "CZE" "CZE" "NLD" ...
##  $ Country    : chr  "Czechia" "Czechia" "Czechia" "Netherlands" ...
##  $ WEEK       : int  46 46 46 3 3 3 3 37 37 37 ...
##  $ Week.number: int  46 46 46 3 3 3 3 37 37 37 ...
##  $ GENDER     : chr  "TOTAL" "TOTAL" "TOTAL" "TOTAL" ...
##  $ Gender     : chr  "Total" "Total" "Total" "Total" ...
##  $ AGE        : chr  "Y0T44" "Y0T44" "Y0T44" "Y_GE65" ...
##  $ Age        : chr  "0 to 44" "0 to 44" "0 to 44" "65 and over" ...
##  $ VARIABLE   : chr  "EXCESSNB" "EXCESSNB" "EXCESSNB" "EXCESSNB" ...
##  $ Variable   : chr  "Excess deaths (number)" "Excess deaths (number)" "Excess deaths (number)" "Excess deaths (number)" ...
##  $ YEAR       : int  2020 2021 2022 2020 2021 2022 2023 2020 2021 2022 ...
##  $ Year       : int  2020 2021 2022 2020 2021 2022 2023 2020 2021 2022 ...
##  $ Value      : num  2.2 8.2 4.2 -127.4 569.6 ...
##  $ Flag.Codes : chr  "" "" "" "" ...
##  $ Flags      : chr  "" "" "" "" ...
# Histograms (for numeric columns) with bins = 30
# The numeric columns are reshaped to long form (one row per column/value
# pair) so a single ggplot call can draw one panel per column. Free scales let
# each panel choose its own axis ranges, since the columns differ widely.
# select(where(...)) replaces the superseded select_if().
my_data %>%
  select(where(is.numeric)) %>%
  pivot_longer(cols = everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~key, scales = "free")

# Box plots (for numeric columns)
# select(where(...)) replaces the superseded select_if().
# Each column gets its own panel with free scales: on a single shared y-axis
# the wide-ranging columns flatten the narrow-ranging ones into unreadable
# lines.
my_data %>%
  select(where(is.numeric)) %>%
  pivot_longer(cols = everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(key, value)) +
  geom_boxplot() +
  facet_wrap(~key, scales = "free")

# Scatter plots (for pairs of numeric columns)
# vapply() guarantees a logical mask whatever the input (sapply() can return a
# list for a zero-column data frame); drop = FALSE keeps the selection a data
# frame even if only one numeric column is present.
pairs(my_data[, vapply(my_data, is.numeric, logical(1)), drop = FALSE])