Data Set Examination

# Install skimr only when it is missing, so that re-running or knitting this
# document does not reinstall the package (and need network access) each time.
if (!requireNamespace("skimr", quietly = TRUE)) {
  install.packages("skimr")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)
library(ggplot2) 
library(skimr)

# Read the CSV from the working directory into a data frame
my_data <- read.csv(file = "ED.csv", header = TRUE)

# Per-column summary (quartiles for numeric columns, length/class for text)
my_data %>%
  summary()

##    COUNTRY            Country               WEEK        Week.number   
##  Length:135628      Length:135628      Min.   : 1.00   Min.   : 1.00  
##  Class :character   Class :character   1st Qu.:13.00   1st Qu.:13.00  
##  Mode  :character   Mode  :character   Median :26.00   Median :26.00  
##                                        Mean   :25.78   Mean   :25.78  
##                                        3rd Qu.:38.00   3rd Qu.:38.00  
##                                        Max.   :53.00   Max.   :53.00  
##     GENDER             Gender              AGE                Age           
##  Length:135628      Length:135628      Length:135628      Length:135628     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    VARIABLE           Variable              YEAR           Year     
##  Length:135628      Length:135628      Min.   :2020   Min.   :2020  
##  Class :character   Class :character   1st Qu.:2020   1st Qu.:2020  
##  Mode  :character   Mode  :character   Median :2021   Median :2021  
##                                        Mean   :2021   Mean   :2021  
##                                        3rd Qu.:2022   3rd Qu.:2022  
##                                        Max.   :2023   Max.   :2023  
##      Value           Flag.Codes           Flags          
##  Min.   :-12788.8   Length:135628      Length:135628     
##  1st Qu.:    -4.6   Class :character   Class :character  
##  Median :     6.2   Mode  :character   Mode  :character  
##  Mean   :   116.9                                        
##  3rd Qu.:    29.4                                        
##  Max.   : 28262.9

# Richer per-type summary from skimr. Called directly (not through %>%) so
# the report keeps showing the data name as `my_data`.
skimr::skim(data = my_data)

Data summary
Name	my_data
Number of rows	135628
Number of columns	15
_______________________
Column type frequency:
character	10
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	empty	n_unique
COUNTRY	1	3	3	0	34
Country	1	5	15	0	34
GENDER	1	4	6	0	3
Gender	1	5	7	0	3
AGE	1	5	6	0	4
Age	1	5	11	0	4
VARIABLE	1	8	8	0	2
Variable	1	22	37	0	2
Flag.Codes	1	0	1	134938	2
Flags	1	0	25	134938	2

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
WEEK	1	25.78	14.75	1.0	13.0	26.0	38.0	53.0	▇▇▇▇▆
Week.number	1	25.78	14.75	1.0	13.0	26.0	38.0	53.0	▇▇▇▇▆
YEAR	1	2021.41	1.10	2020.0	2020.0	2021.0	2022.0	2023.0	▇▇▁▇▆
Year	1	2021.41	1.10	2020.0	2020.0	2021.0	2022.0	2023.0	▇▇▁▇▆
Value	1	116.92	766.43	-12788.8	-4.6	6.2	29.4	28262.9	▁▇▁▁▁

# Missing Values
# read.csv() keeps blank character fields as "" rather than NA, so is.na()
# alone reports zero missing cells even though skim() shows 134938 empty
# entries in both Flag.Codes and Flags. Count blank strings per column first,
# then the true NA total.
colSums(my_data == "", na.rm = TRUE)
sum(is.na(my_data))

## [1] 0

# NA count for each column
my_data %>%
  is.na() %>%
  colSums()

##     COUNTRY     Country        WEEK Week.number      GENDER      Gender 
##           0           0           0           0           0           0 
##         AGE         Age    VARIABLE    Variable        YEAR        Year 
##           0           0           0           0           0           0 
##       Value  Flag.Codes       Flags 
##           0           0           0

# Structure of the data set: column types and the first few values of each
my_data %>%
  str()

## 'data.frame':    135628 obs. of  15 variables:
##  $ COUNTRY    : chr  "CZE" "CZE" "CZE" "NLD" ...
##  $ Country    : chr  "Czechia" "Czechia" "Czechia" "Netherlands" ...
##  $ WEEK       : int  46 46 46 3 3 3 3 37 37 37 ...
##  $ Week.number: int  46 46 46 3 3 3 3 37 37 37 ...
##  $ GENDER     : chr  "TOTAL" "TOTAL" "TOTAL" "TOTAL" ...
##  $ Gender     : chr  "Total" "Total" "Total" "Total" ...
##  $ AGE        : chr  "Y0T44" "Y0T44" "Y0T44" "Y_GE65" ...
##  $ Age        : chr  "0 to 44" "0 to 44" "0 to 44" "65 and over" ...
##  $ VARIABLE   : chr  "EXCESSNB" "EXCESSNB" "EXCESSNB" "EXCESSNB" ...
##  $ Variable   : chr  "Excess deaths (number)" "Excess deaths (number)" "Excess deaths (number)" "Excess deaths (number)" ...
##  $ YEAR       : int  2020 2021 2022 2020 2021 2022 2023 2020 2021 2022 ...
##  $ Year       : int  2020 2021 2022 2020 2021 2022 2023 2020 2021 2022 ...
##  $ Value      : num  2.2 8.2 4.2 -127.4 569.6 ...
##  $ Flag.Codes : chr  "" "" "" "" ...
##  $ Flags      : chr  "" "" "" "" ...

# Histograms of every numeric column: 30 bins each, one panel per column with
# independent axes. select(where(...)) replaces the superseded select_if().
my_data %>%
  select(where(is.numeric)) %>%
  pivot_longer(cols = everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~key, scales = "free")

# Box plots of every numeric column.
# Each column gets its own panel with free scales: on a single shared y-axis
# the range of Value (about -12789 to 28263) flattens the WEEK and YEAR boxes
# into unreadable lines. select(where(...)) replaces the superseded
# select_if().
my_data %>%
  select(where(is.numeric)) %>%
  pivot_longer(cols = everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(x = key, y = value)) +
  geom_boxplot() +
  facet_wrap(~key, scales = "free")

# Scatter-plot matrix for every pair of numeric columns.
# vapply() always yields a logical mask (sapply()'s return type depends on
# its input), and drop = FALSE ensures pairs() receives a data frame even if
# only one column is numeric.
numeric_cols <- vapply(my_data, is.numeric, logical(1))
pairs(my_data[, numeric_cols, drop = FALSE])

Data Set Examination

Patrick Ford

2024-02-28