# Install skimr only when it is not already available, so that re-running the
# script does not reinstall the package (and hit the network) every time.
if (!requireNamespace("skimr", quietly = TRUE)) {
  install.packages("skimr")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(skimr)
# Read the ED data set (ED.csv, resolved against the working directory)
my_data <- utils::read.csv(file = "ED.csv")
# Per-column overview: quartiles for numeric columns, length/class for text
summary(object = my_data)
## COUNTRY Country WEEK Week.number
## Length:135628 Length:135628 Min. : 1.00 Min. : 1.00
## Class :character Class :character 1st Qu.:13.00 1st Qu.:13.00
## Mode :character Mode :character Median :26.00 Median :26.00
## Mean :25.78 Mean :25.78
## 3rd Qu.:38.00 3rd Qu.:38.00
## Max. :53.00 Max. :53.00
## GENDER Gender AGE Age
## Length:135628 Length:135628 Length:135628 Length:135628
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## VARIABLE Variable YEAR Year
## Length:135628 Length:135628 Min. :2020 Min. :2020
## Class :character Class :character 1st Qu.:2020 1st Qu.:2020
## Mode :character Mode :character Median :2021 Median :2021
## Mean :2021 Mean :2021
## 3rd Qu.:2022 3rd Qu.:2022
## Max. :2023 Max. :2023
## Value Flag.Codes Flags
## Min. :-12788.8 Length:135628 Length:135628
## 1st Qu.: -4.6 Class :character Class :character
## Median : 6.2 Mode :character Mode :character
## Mean : 116.9
## 3rd Qu.: 29.4
## Max. : 28262.9
# Richer per-column profile from skimr (missingness, uniqueness, quantiles).
# Called directly rather than through a pipe so the summary keeps reporting
# the data name as `my_data`.
skimr::skim(data = my_data)
## Data summary
## | Name                     | my_data |
## | Number of rows           | 135628  |
## | Number of columns        | 15      |
## | _______________________  |         |
## | Column type frequency:   |         |
## | character                | 10      |
## | numeric                  | 5       |
## | ________________________ |         |
## | Group variables          | None    |
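## Variable type: character
## | skim_variable | n_missing | complete_rate | min | max | empty  | n_unique | whitespace |
## |:--------------|----------:|--------------:|----:|----:|-------:|---------:|-----------:|
## | COUNTRY       | 0         | 1             | 3   | 3   | 0      | 34       | 0          |
## | Country       | 0         | 1             | 5   | 15  | 0      | 34       | 0          |
## | GENDER        | 0         | 1             | 4   | 6   | 0      | 3        | 0          |
## | Gender        | 0         | 1             | 5   | 7   | 0      | 3        | 0          |
## | AGE           | 0         | 1             | 5   | 6   | 0      | 4        | 0          |
## | Age           | 0         | 1             | 5   | 11  | 0      | 4        | 0          |
## | VARIABLE      | 0         | 1             | 8   | 8   | 0      | 2        | 0          |
## | Variable      | 0         | 1             | 22  | 37  | 0      | 2        | 0          |
## | Flag.Codes    | 0         | 1             | 0   | 1   | 134938 | 2        | 0          |
## | Flags         | 0         | 1             | 0   | 25  | 134938 | 2        | 0          |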
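## Variable type: numeric
## | skim_variable | n_missing | complete_rate | mean    | sd     | p0       | p25    | p50    | p75    | p100    | hist  |
## |:--------------|----------:|--------------:|--------:|-------:|---------:|-------:|-------:|-------:|--------:|:------|
## | WEEK          | 0         | 1             | 25.78   | 14.75  | 1.0      | 13.0   | 26.0   | 38.0   | 53.0    | ▇▇▇▇▆ |
## | Week.number   | 0         | 1             | 25.78   | 14.75  | 1.0      | 13.0   | 26.0   | 38.0   | 53.0    | ▇▇▇▇▆ |
## | YEAR          | 0         | 1             | 2021.41 | 1.10   | 2020.0   | 2020.0 | 2021.0 | 2022.0 | 2023.0  | ▇▇▁▇▆ |
## | Year          | 0         | 1             | 2021.41 | 1.10   | 2020.0   | 2020.0 | 2021.0 | 2022.0 | 2023.0  | ▇▇▁▇▆ |
## | Value         | 0         | 1             | 116.92  | 766.43 | -12788.8 | -4.6   | 6.2    | 29.4   | 28262.9 | ▁▇▁▁▁ |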
# Missing values: overall NA count first, then the NA count per column
my_data %>%
  is.na() %>%
  sum()
## [1] 0
my_data %>%
  is.na() %>%
  colSums()
## COUNTRY Country WEEK Week.number GENDER Gender
## 0 0 0 0 0 0
## AGE Age VARIABLE Variable YEAR Year
## 0 0 0 0 0 0
## Value Flag.Codes Flags
## 0 0 0
# Structure of the data set: dimensions, column types and leading values
utils::str(object = my_data)
## 'data.frame': 135628 obs. of 15 variables:
## $ COUNTRY : chr "CZE" "CZE" "CZE" "NLD" ...
## $ Country : chr "Czechia" "Czechia" "Czechia" "Netherlands" ...
## $ WEEK : int 46 46 46 3 3 3 3 37 37 37 ...
## $ Week.number: int 46 46 46 3 3 3 3 37 37 37 ...
## $ GENDER : chr "TOTAL" "TOTAL" "TOTAL" "TOTAL" ...
## $ Gender : chr "Total" "Total" "Total" "Total" ...
## $ AGE : chr "Y0T44" "Y0T44" "Y0T44" "Y_GE65" ...
## $ Age : chr "0 to 44" "0 to 44" "0 to 44" "65 and over" ...
## $ VARIABLE : chr "EXCESSNB" "EXCESSNB" "EXCESSNB" "EXCESSNB" ...
## $ Variable : chr "Excess deaths (number)" "Excess deaths (number)" "Excess deaths (number)" "Excess deaths (number)" ...
## $ YEAR : int 2020 2021 2022 2020 2021 2022 2023 2020 2021 2022 ...
## $ Year : int 2020 2021 2022 2020 2021 2022 2023 2020 2021 2022 ...
## $ Value : num 2.2 8.2 4.2 -127.4 569.6 ...
## $ Flag.Codes : chr "" "" "" "" ...
## $ Flags : chr "" "" "" "" ...
# Histograms of every numeric column (30 bins each), one facet per column.
# `select(where(is.numeric))` replaces the superseded `select_if()`.
# Free scales are used because the columns span very different ranges
# (week numbers 1-53 vs. Value from about -12789 to 28263).
my_data %>%
  select(where(is.numeric)) %>%
  pivot_longer(cols = everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~key, scales = "free")

# Box plots of every numeric column, one facet per column.
# `select(where(is.numeric))` replaces the superseded `select_if()`.
# Each column gets its own free-scaled panel: on a single shared y-axis the
# range of Value (about -12789 to 28263) flattens the WEEK and YEAR boxes
# into unreadable lines. This mirrors the faceting used for the histograms.
my_data %>%
  select(where(is.numeric)) %>%
  pivot_longer(cols = everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(x = key, y = value)) +
  geom_boxplot() +
  facet_wrap(~key, scales = "free")

# Scatter-plot matrix for every pair of numeric columns.
# vapply() guarantees a logical mask with one entry per column (sapply()'s
# return type depends on its input), and drop = FALSE keeps the selection a
# data frame even if only a single numeric column is present.
numeric_cols <- vapply(my_data, is.numeric, logical(1))
pairs(my_data[, numeric_cols, drop = FALSE])
