Продолжение
Данные Всемирной организации здравоохранения (ВОЗ) будут использованы в качестве иллюстрации к положениям, изложенным в книге Garrett Grolemund и Hadley Wickham “R for Data Science”. Web-версия книги находится в свободном доступе.
Файл с данными, используемый для изучения материалов книги, собран с использованием пакета WHO. Создание файла описано ранее.
library(tidyverse)
# Load the WHO indicators table. stringsAsFactors = TRUE is spelled out
# because R >= 4.0 no longer converts strings to factors by default, while the
# rest of this walkthrough (factor columns in glimpse() and summary()) relies
# on text columns being read as factors.
WHOData <- read.csv("WHOData.csv", stringsAsFactors = TRUE)
# Compact overview: one line per column with its type and first values.
glimpse(WHOData)
## Observations: 181
## Variables: 10
## $ country <fctr> Bosnia and Herzegovina, Botsw...
## $ worldbankincomegroup <fctr> Upper-middle-income, Upper-mi...
## $ PerCapitaTotalHealthExpenditure <dbl> 957.40, 870.84, 6468.50, 472.6...
## $ region <fctr> Europe, Africa, Europe, Weste...
## $ LifeExpectancy <dbl> 77.4, 65.7, 83.4, 69.4, 70.9, ...
## $ AdolescentBirthRate <dbl> 11.0, 39.0, 2.0, 32.6, 56.0, 1...
## $ ContraceptivePrevalence <dbl> 45.8, 52.8, 82.0, NA, 60.3, 63...
## $ LowBirthWeight <int> 5, 13, 6, 18, 13, 4, 14, 27, 5...
## $ LiteracyRate <int> 98, 85, NA, NA, 74, 100, 61, N...
## $ TotalFertilityRate <dbl> 1.3, 2.6, 1.5, 3.3, 2.8, 1.6, ...
# Country names are identifiers, not categories: store them as plain strings.
WHOData[["country"]] <- as.character(WHOData[["country"]])
# Re-create the income-group factor with its levels listed from low to high
# income, so tables and plots follow that order instead of the alphabetical one.
WHOData[["worldbankincomegroup"]] <- factor(
  WHOData[["worldbankincomegroup"]],
  levels = c(
    "Low-income",
    "Lower-middle-income",
    "Upper-middle-income",
    "High-income"
  )
)
# Per-column summary of the prepared data frame.
summary(WHOData)
## country worldbankincomegroup
## Length:181 Low-income :29
## Class :character Lower-middle-income:49
## Mode :character Upper-middle-income:48
## High-income :55
##
##
##
## PerCapitaTotalHealthExpenditure region
## Min. : 24.96 Africa :47
## 1st Qu.: 202.16 Americas :33
## Median : 698.30 Eastern Mediterranean:20
## Mean :1269.48 Europe :50
## 3rd Qu.:1718.02 South-East Asia :10
## Max. :9402.54 Western Pacific :21
##
## LifeExpectancy AdolescentBirthRate ContraceptivePrevalence
## Min. :50.10 Min. : 1.70 Min. : 4.00
## 1st Qu.:65.70 1st Qu.: 17.00 1st Qu.:34.40
## Median :73.50 Median : 45.70 Median :54.80
## Mean :71.38 Mean : 56.77 Mean :51.21
## 3rd Qu.:76.70 3rd Qu.: 84.00 3rd Qu.:70.30
## Max. :83.70 Max. :229.00 Max. :88.40
## NA's :16
## LowBirthWeight LiteracyRate TotalFertilityRate
## Min. : 3.00 Min. : 29.00 Min. :1.300
## 1st Qu.: 6.00 1st Qu.: 71.00 1st Qu.:1.800
## Median : 9.00 Median : 90.00 Median :2.300
## Mean :10.52 Mean : 83.36 Mean :2.863
## 3rd Qu.:13.00 3rd Qu.: 98.00 3rd Qu.:3.800
## Max. :34.00 Max. :100.00 Max. :7.600
## NA's :7 NA's :50
Пакет dplyr, входящий в состав tidyverse, предлагает следующие основные функции для преобразования данных: filter(), select(), arrange(), mutate(), rename(), group_by() и summarise().
При использовании этих и других функций предлагается широко использовать оператор %>% (pipe).
Функция filter() позволяет выбирать строки, соответствующие заданным условиям.
Например,
Выбор стран региона Юго-Восточной Азии
# Keep only the countries of the South-East Asia region.
# as_tibble() replaces the deprecated tbl_df(); it only switches the result to
# the compact tibble print and does not change the data.
WHOData %>%
  filter(region == "South-East Asia") %>%
  as_tibble()
## # A tibble: 10 × 10
## country worldbankincomegroup PerCapitaTotalHealthExpenditure
## <chr> <fctr> <dbl>
## 1 Sri Lanka Lower-middle-income 369.17
## 2 Thailand Upper-middle-income 950.14
## 3 India Lower-middle-income 267.41
## 4 Bhutan Lower-middle-income 281.10
## 5 Myanmar Lower-middle-income 103.47
## 6 Timor-Leste Lower-middle-income 101.54
## 7 Maldives Upper-middle-income 1995.84
## 8 Nepal Low-income 137.40
## 9 Bangladesh Lower-middle-income 88.08
## 10 Indonesia Lower-middle-income 299.41
## # ... with 7 more variables: region <fctr>, LifeExpectancy <dbl>,
## # AdolescentBirthRate <dbl>, ContraceptivePrevalence <dbl>,
## # LowBirthWeight <int>, LiteracyRate <int>, TotalFertilityRate <dbl>
Выбор европейских стран с доходом ниже среднего
# European countries in the lower-middle income group. Comma-separated
# conditions inside filter() are combined with AND.
# as_tibble() replaces the deprecated tbl_df().
WHOData %>%
  filter(
    region == "Europe",
    worldbankincomegroup == "Lower-middle-income"
  ) %>%
  as_tibble()
## # A tibble: 7 × 10
## country worldbankincomegroup PerCapitaTotalHealthExpenditure
## <chr> <fctr> <dbl>
## 1 Georgia Lower-middle-income 627.74
## 2 Uzbekistan Lower-middle-income 339.61
## 3 Kyrgyzstan Lower-middle-income 215.06
## 4 Ukraine Lower-middle-income 584.24
## 5 Republic of Moldova Lower-middle-income 514.21
## 6 Armenia Lower-middle-income 362.13
## 7 Tajikistan Lower-middle-income 185.15
## # ... with 7 more variables: region <fctr>, LifeExpectancy <dbl>,
## # AdolescentBirthRate <dbl>, ContraceptivePrevalence <dbl>,
## # LowBirthWeight <int>, LiteracyRate <int>, TotalFertilityRate <dbl>
Выбор стран региона ВОЗ Восточное Средиземноморье, входящих в группы стран с низкими доходами или доходами ниже средних.
# Eastern Mediterranean countries whose income group is either lower-middle
# or low: the two income comparisons are joined with the element-wise OR (|),
# and that result is ANDed with the region condition.
# as_tibble() replaces the deprecated tbl_df().
WHOData %>%
  filter(
    region == "Eastern Mediterranean",
    worldbankincomegroup == "Lower-middle-income" |
      worldbankincomegroup == "Low-income"
  ) %>%
  as_tibble()
## # A tibble: 8 × 10
## country worldbankincomegroup
## <chr> <fctr>
## 1 Egypt Lower-middle-income
## 2 Pakistan Lower-middle-income
## 3 Morocco Lower-middle-income
## 4 Syrian Arab Republic Lower-middle-income
## 5 Djibouti Lower-middle-income
## 6 Afghanistan Low-income
## 7 Yemen Lower-middle-income
## 8 Sudan Lower-middle-income
## # ... with 8 more variables: PerCapitaTotalHealthExpenditure <dbl>,
## # region <fctr>, LifeExpectancy <dbl>, AdolescentBirthRate <dbl>,
## # ContraceptivePrevalence <dbl>, LowBirthWeight <int>,
## # LiteracyRate <int>, TotalFertilityRate <dbl>
# The same selection written with %in% (membership in a set of values)
# instead of several == comparisons joined with |.
# as_tibble() replaces the deprecated tbl_df().
WHOData %>%
  filter(
    region == "Eastern Mediterranean",
    worldbankincomegroup %in% c("Lower-middle-income", "Low-income")
  ) %>%
  as_tibble()
## # A tibble: 8 × 10
## country worldbankincomegroup
## <chr> <fctr>
## 1 Egypt Lower-middle-income
## 2 Pakistan Lower-middle-income
## 3 Morocco Lower-middle-income
## 4 Syrian Arab Republic Lower-middle-income
## 5 Djibouti Lower-middle-income
## 6 Afghanistan Low-income
## 7 Yemen Lower-middle-income
## 8 Sudan Lower-middle-income
## # ... with 8 more variables: PerCapitaTotalHealthExpenditure <dbl>,
## # region <fctr>, LifeExpectancy <dbl>, AdolescentBirthRate <dbl>,
## # ContraceptivePrevalence <dbl>, LowBirthWeight <int>,
## # LiteracyRate <int>, TotalFertilityRate <dbl>
Выбор стран региона Африка, кроме входящих в группу с низкими доходами
# African countries from every income group except the low-income one.
# as_tibble() replaces the deprecated tbl_df().
WHOData %>%
  filter(
    region == "Africa",
    worldbankincomegroup != "Low-income"
  ) %>%
  as_tibble()
## # A tibble: 22 × 10
## country worldbankincomegroup
## <chr> <fctr>
## 1 Botswana Upper-middle-income
## 2 Algeria Upper-middle-income
## 3 Namibia Upper-middle-income
## 4 Angola Upper-middle-income
## 5 Cameroon Lower-middle-income
## 6 Cabo Verde Lower-middle-income
## 7 Lesotho Lower-middle-income
## 8 Senegal Lower-middle-income
## 9 Sao Tome and Principe Lower-middle-income
## 10 South Africa Upper-middle-income
## # ... with 12 more rows, and 8 more variables:
## # PerCapitaTotalHealthExpenditure <dbl>, region <fctr>,
## # LifeExpectancy <dbl>, AdolescentBirthRate <dbl>,
## # ContraceptivePrevalence <dbl>, LowBirthWeight <int>,
## # LiteracyRate <int>, TotalFertilityRate <dbl>
Функция select() позволяет отбирать нужные столбцы (переменные), которые будут использоваться далее в расчетах или графиках.
# Scatter plot of total fertility rate against literacy rate, one point per
# country, coloured by World Bank income group. Only the columns named in
# select() are passed on to ggplot().
WHOData %>%
  select(country, worldbankincomegroup, TotalFertilityRate, LiteracyRate) %>%
  ggplot(
    mapping = aes(
      x = LiteracyRate,
      y = TotalFertilityRate,
      colour = worldbankincomegroup
    )
  ) +
  geom_point()
## Warning: Removed 50 rows containing missing values (geom_point).
# Scatter plot of contraceptive prevalence against literacy rate, one point
# per country, coloured by World Bank income group.
WHOData %>%
  select(
    country,
    worldbankincomegroup,
    ContraceptivePrevalence,
    LiteracyRate
  ) %>%
  ggplot(
    mapping = aes(
      x = LiteracyRate,
      y = ContraceptivePrevalence,
      colour = worldbankincomegroup
    )
  ) +
  geom_point()
## Warning: Removed 55 rows containing missing values (geom_point).
Функция arrange() позволяет сортировать строки датафрейма по значениям выбранных столбцов.
# Ascending order: countries sorted from the lowest literacy rate upwards.
# as_tibble() replaces the deprecated tbl_df().
WHOData %>%
  select(country, TotalFertilityRate, LiteracyRate) %>%
  arrange(LiteracyRate) %>%
  as_tibble()
## # A tibble: 181 × 3
## country TotalFertilityRate LiteracyRate
## <chr> <dbl> <int>
## 1 Burkina Faso 5.6 29
## 2 Mali 6.8 33
## 3 Chad 6.3 35
## 4 Ethiopia 4.5 39
## 5 Guinea 4.9 41
## 6 Benin 4.8 42
## 7 Sierra Leone 4.7 43
## 8 Senegal 4.9 50
## 9 Gambia 5.8 51
## 10 Pakistan 3.2 55
## # ... with 171 more rows
# Descending order: countries sorted from the highest fertility rate downwards.
# desc() states the intent explicitly and, unlike unary minus, also works for
# non-numeric columns; for this numeric column the resulting order is the same.
# as_tibble() replaces the deprecated tbl_df().
WHOData %>%
  select(country, TotalFertilityRate, LiteracyRate) %>%
  arrange(desc(TotalFertilityRate)) %>%
  as_tibble()
## # A tibble: 181 × 3
## country TotalFertilityRate LiteracyRate
## <chr> <dbl> <int>
## 1 Niger 7.6 NA
## 2 Mali 6.8 33
## 3 Chad 6.3 35
## 4 Nigeria 6.0 61
## 5 Burundi 6.0 67
## 6 Democratic Republic of the Congo 5.9 67
## 7 Angola 5.9 70
## 8 Timor-Leste 5.9 58
## 9 Uganda 5.9 73
## 10 Gambia 5.8 51
## # ... with 171 more rows
Функция mutate() позволяет создавать новые переменные.
В используемом датафрейме многие переменные выражены в процентах. Преобразуем значения переменной LiteracyRate из процентов в доли.
# Add LiteracyRate.Part: the literacy rate converted from percent to a share
# (multiplied by 0.01), then sort by the original percentage, ascending.
# as_tibble() replaces the deprecated tbl_df().
WHOData %>%
  select(country, LiteracyRate) %>%
  mutate(LiteracyRate.Part = LiteracyRate * 0.01) %>%
  arrange(LiteracyRate) %>%
  as_tibble()
## # A tibble: 181 × 3
## country LiteracyRate LiteracyRate.Part
## <chr> <int> <dbl>
## 1 Burkina Faso 29 0.29
## 2 Mali 33 0.33
## 3 Chad 35 0.35
## 4 Ethiopia 39 0.39
## 5 Guinea 41 0.41
## 6 Benin 42 0.42
## 7 Sierra Leone 43 0.43
## 8 Senegal 50 0.50
## 9 Gambia 51 0.51
## 10 Pakistan 55 0.55
## # ... with 171 more rows
Используя функцию mutate(), можно создать переменную, указывающую на ранг значения.
# Add LiteracyRate.Rank: the position of each country's literacy rate in
# ascending order. row_number() gives tied values distinct consecutive ranks,
# and a missing literacy rate gets an NA rank.
# as_tibble() replaces the deprecated tbl_df().
WHOData %>%
  select(country, LiteracyRate) %>%
  mutate(LiteracyRate.Rank = row_number(LiteracyRate)) %>%
  as_tibble()
## # A tibble: 181 × 3
## country LiteracyRate LiteracyRate.Rank
## <chr> <int> <int>
## 1 Bosnia and Herzegovina 98 98
## 2 Botswana 85 46
## 3 Switzerland NA NA
## 4 Micronesia (Federated States of) NA NA
## 5 Egypt 74 37
## 6 Estonia 100 117
## 7 Malawi 61 20
## 8 Niger NA NA
## 9 Norway NA NA
## 10 Oman 87 52
## # ... with 171 more rows
Функция mutate() позволяет создать номинативные переменные из количественных.
Уровень воспроизводства населения (replacement rate) для суммарного коэффициента рождаемости (Total Fertility Rate) составляет 2,33 (по данным Википедии).
# Turn the numeric fertility rate into a two-level label using the 2.33
# threshold: strictly below it -> "BelowReplacement", otherwise "Replacement"
# (a missing fertility rate stays NA). The label spelling is corrected from
# "BelowReplacememt"; as_tibble() replaces the deprecated tbl_df().
WHOData %>%
  select(country, TotalFertilityRate) %>%
  mutate(
    TotalFertRate.Rank = ifelse(
      TotalFertilityRate < 2.33, "BelowReplacement", "Replacement"
    )
  ) %>%
  as_tibble()
## # A tibble: 181 × 3
## country TotalFertilityRate TotalFertRate.Rank
## <chr> <dbl> <chr>
## 1 Bosnia and Herzegovina 1.3 BelowReplacement
## 2 Botswana 2.6 Replacement
## 3 Switzerland 1.5 BelowReplacement
## 4 Micronesia (Federated States of) 3.3 Replacement
## 5 Egypt 2.8 Replacement
## 6 Estonia 1.6 BelowReplacement
## 7 Malawi 5.4 Replacement
## 8 Niger 7.6 Replacement
## 9 Norway 1.9 BelowReplacement
## 10 Oman 2.9 Replacement
## # ... with 171 more rows
# The same classification summarised per region: horizontal stacked bars that
# count countries, filled by the below/at-or-above replacement label.
WHOData %>%
  select(region, TotalFertilityRate) %>%
  mutate(
    TotalFertilityRate.Rank = ifelse(
      TotalFertilityRate < 2.33, "BelowReplacement", "Replacement"
    )
  ) %>%
  ggplot(aes(region, fill = TotalFertilityRate.Rank)) +
  geom_bar() +
  coord_flip()
Но полученные результаты не очень точны, поскольку показатель Replacement rate не является единым для всех стран. Значение 2.33 выбрано как среднее.
Объединим записи в группы по продолжительности жизни с шагом в 10 лет и представим в виде таблицы и графика
# Bin life expectancy into four 10-year groups with cut().
# Breaks are 50, 60, 70, 80, 90. With cut()'s default right = TRUE the
# intervals are (50,60], (60,70], (70,80], (80,90], so a value that falls
# exactly on a break goes to the lower group.
# as_tibble() replaces the deprecated tbl_df().
WHOData %>%
  select(country, LifeExpectancy) %>%
  mutate(
    LifeExpectancy.Group = cut(
      LifeExpectancy,
      breaks = seq(50, 90, 10),
      labels = c("less 60", "60-70", "70-80", "more 80")
    )
  ) %>%
  as_tibble()
## # A tibble: 181 × 3
## country LifeExpectancy LifeExpectancy.Group
## <chr> <dbl> <fctr>
## 1 Bosnia and Herzegovina 77.4 70-80
## 2 Botswana 65.7 60-70
## 3 Switzerland 83.4 more 80
## 4 Micronesia (Federated States of) 69.4 60-70
## 5 Egypt 70.9 70-80
## 6 Estonia 77.6 70-80
## 7 Malawi 58.3 less 60
## 8 Niger 61.8 60-70
## 9 Norway 81.8 more 80
## 10 Oman 76.6 70-80
## # ... with 171 more rows
# Bar chart of the number of countries in each 10-year life-expectancy group,
# with every bar split by region. The break points are written out
# explicitly; they are the same values as seq(50, 90, 10).
WHOData %>%
  select(region, LifeExpectancy) %>%
  mutate(
    LifeExpectancy.Group = cut(
      LifeExpectancy,
      breaks = c(50, 60, 70, 80, 90),
      labels = c("less 60", "60-70", "70-80", "more 80")
    )
  ) %>%
  ggplot(mapping = aes(x = LifeExpectancy.Group, fill = region)) +
  geom_bar()
Функция rename() используется для переименования переменных.
# Compute the literacy share, then rename that new column to Literacy.
# rename() takes new_name = old_name.
# as_tibble() replaces the deprecated tbl_df().
WHOData %>%
  select(country, LiteracyRate) %>%
  mutate(LiteracyRate.Part = LiteracyRate * 0.01) %>%
  rename(Literacy = LiteracyRate.Part) %>%
  as_tibble()
## # A tibble: 181 × 3
## country LiteracyRate Literacy
## <chr> <int> <dbl>
## 1 Bosnia and Herzegovina 98 0.98
## 2 Botswana 85 0.85
## 3 Switzerland NA NA
## 4 Micronesia (Federated States of) NA NA
## 5 Egypt 74 0.74
## 6 Estonia 100 1.00
## 7 Malawi 61 0.61
## 8 Niger NA NA
## 9 Norway NA NA
## 10 Oman 87 0.87
## # ... with 171 more rows
Функция group_by() группирует наблюдения в соответствии со значениями выбранных переменных, а summarise() вычисляет сводные показатели (но не только суммы! :)) по группам.
# Per income group: the median per-capita health expenditure and the number
# of countries in the group (n()).
# summarise() on grouped data already returns a tibble, so the trailing
# deprecated tbl_df() call was redundant and is dropped.
WHOData %>%
  select(worldbankincomegroup, PerCapitaTotalHealthExpenditure) %>%
  group_by(worldbankincomegroup) %>%
  summarise(
    PerCapitaTotalHealthExpenditure.Median =
      median(PerCapitaTotalHealthExpenditure),
    count.gr = n()
  )
## # A tibble: 4 × 3
## worldbankincomegroup PerCapitaTotalHealthExpenditure.Median count.gr
## <fctr> <dbl> <int>
## 1 Low-income 90.960 29
## 2 Lower-middle-income 299.410 49
## 3 Upper-middle-income 906.355 48
## 4 High-income 2530.570 55
# Per income group: median, minimum and maximum per-capita health expenditure,
# drawn as a grey bar (median) with a point on top and an error bar spanning
# min..max.
WHOData %>%
  select(worldbankincomegroup, PerCapitaTotalHealthExpenditure) %>%
  group_by(worldbankincomegroup) %>%
  summarise(
    PerCapitaTotalHealthExpenditure.Median =
      median(PerCapitaTotalHealthExpenditure),
    PCTHE.min = min(PerCapitaTotalHealthExpenditure),
    PCTHE.max = max(PerCapitaTotalHealthExpenditure)
  ) %>%
  ggplot(aes(worldbankincomegroup, PerCapitaTotalHealthExpenditure.Median)) +
  # geom_col() is the dedicated geom for bars whose heights are values already
  # present in the data (equivalent to geom_bar(stat = "identity")).
  geom_col(colour = "grey", fill = "lightgrey") +
  geom_point(size = 2) +
  # The cap width is a constant, so it is set as a parameter rather than
  # mapped inside aes(); only ymin/ymax come from the data.
  geom_errorbar(aes(ymin = PCTHE.min, ymax = PCTHE.max), width = 0.25) +
  labs(
    title = "Per Capita Total Health Expenditure (PPP int. $), Median, Min, Max",
    x = "",
    y = ""
  )
Данные последнего графика можно было визуализировать и более простым путем создания boxplot
# Box plot of per-capita health expenditure for each income group, built
# directly from the raw rows (no manual summarising needed).
WHOData %>%
  ggplot(
    mapping = aes(
      x = worldbankincomegroup,
      y = PerCapitaTotalHealthExpenditure
    )
  ) +
  geom_boxplot()
To be continued