Apply the following dplyr verbs to your data

Filter rows

# Keep only the 2023 observations for the Europe & Central Asia region.
# Conditions separated by commas must all hold for a row to be kept.
data %>%
  filter(region == "Europe & Central Asia", year == 2023)

## # A tibble: 58 × 12
##    iso3c country     region income  year population overall_score data_use_score
##    <chr> <chr>       <chr>  <chr>  <dbl>      <dbl> <chr>                  <dbl>
##  1 DNK   Denmark     Europ… High …  2023    5946952 95.255833333…            100
##  2 FIN   Finland     Europ… High …  2023    5584264 95.115416666…            100
##  3 POL   Poland      Europ… High …  2023   36685849 94.65375                 100
##  4 SWE   Sweden      Europ… High …  2023   10536632 94.41                    100
##  5 ESP   Spain       Europ… High …  2023   48373336 94.325                   100
##  6 NLD   Netherlands Europ… High …  2023   17879488 94.3225                  100
##  7 SVN   Slovenia    Europ… High …  2023    2120937 94.220416666…            100
##  8 PRT   Portugal    Europ… High …  2023   10525347 93.778333333…            100
##  9 ITA   Italy       Europ… High …  2023   58761146 93.638750000…            100
## 10 NOR   Norway      Europ… High …  2023    5519594 93.556666666…            100
## # ℹ 48 more rows
## # ℹ 4 more variables: data_services_score <chr>, data_products_score <chr>,
## #   data_sources_score <chr>, data_infrastructure_score <chr>

Arrange rows

# Order rows from the largest population to the smallest
data %>%
  arrange(desc(population))

## # A tibble: 4,340 × 12
##    iso3c country region     income  year population overall_score data_use_score
##    <chr> <chr>   <chr>      <chr>  <dbl>      <dbl> <chr>                  <dbl>
##  1 IND   India   South Asia Lower…  2023 1428627663 73.628333333…           80  
##  2 IND   India   South Asia Lower…  2022 1417173173 74.600833333…           80  
##  3 CHN   China   East Asia… Upper…  2021 1412360000 58.978750000…           83.4
##  4 CHN   China   East Asia… Upper…  2022 1412175000 56.5275                 73.4
##  5 CHN   China   East Asia… Upper…  2020 1411100000 57.8075                 83.4
##  6 CHN   China   East Asia… Upper…  2023 1410710000 59.50375                73.4
##  7 CHN   China   East Asia… Upper…  2019 1407745000 59.885                  83.4
##  8 IND   India   South Asia Lower…  2021 1407563842 77.087916666…           80  
##  9 CHN   China   East Asia… Upper…  2018 1402760000 56.369583333…           83.4
## 10 IND   India   South Asia Lower…  2020 1396387127 73.148749999…           80  
## # ℹ 4,330 more rows
## # ℹ 4 more variables: data_services_score <chr>, data_products_score <chr>,
## #   data_sources_score <chr>, data_infrastructure_score <chr>

Select columns

# Keep just the three columns of interest, in this order
data %>%
  select(country, region, overall_score)

## # A tibble: 4,340 × 3
##    country     region                overall_score    
##    <chr>       <chr>                 <chr>            
##  1 Denmark     Europe & Central Asia 95.25583333333334
##  2 Finland     Europe & Central Asia 95.11541666666668
##  3 Poland      Europe & Central Asia 94.65375         
##  4 Sweden      Europe & Central Asia 94.41            
##  5 Spain       Europe & Central Asia 94.325           
##  6 Netherlands Europe & Central Asia 94.3225          
##  7 Slovenia    Europe & Central Asia 94.22041666666668
##  8 Portugal    Europe & Central Asia 93.77833333333332
##  9 Italy       Europe & Central Asia 93.63875000000002
## 10 Norway      Europe & Central Asia 93.55666666666669
## # ℹ 4,330 more rows

Add columns

# Add score_per_million: the overall score divided by population in millions.
data %>%
  mutate(
    # overall_score is stored as character in this data, so convert it
    # before doing arithmetic on it
    overall_score = as.numeric(overall_score),
    # population already prints as <dbl>; the conversion is kept as a
    # harmless safeguard
    population = as.numeric(population),
    # mutate() evaluates in order, so this uses the converted columns above
    score_per_million = overall_score / (population / 1e6)
  )

## # A tibble: 4,340 × 13
##    iso3c country     region income  year population overall_score data_use_score
##    <chr> <chr>       <chr>  <chr>  <dbl>      <dbl>         <dbl>          <dbl>
##  1 DNK   Denmark     Europ… High …  2023    5946952          95.3            100
##  2 FIN   Finland     Europ… High …  2023    5584264          95.1            100
##  3 POL   Poland      Europ… High …  2023   36685849          94.7            100
##  4 SWE   Sweden      Europ… High …  2023   10536632          94.4            100
##  5 ESP   Spain       Europ… High …  2023   48373336          94.3            100
##  6 NLD   Netherlands Europ… High …  2023   17879488          94.3            100
##  7 SVN   Slovenia    Europ… High …  2023    2120937          94.2            100
##  8 PRT   Portugal    Europ… High …  2023   10525347          93.8            100
##  9 ITA   Italy       Europ… High …  2023   58761146          93.6            100
## 10 NOR   Norway      Europ… High …  2023    5519594          93.6            100
## # ℹ 4,330 more rows
## # ℹ 5 more variables: data_services_score <chr>, data_products_score <chr>,
## #   data_sources_score <chr>, data_infrastructure_score <chr>,
## #   score_per_million <dbl>

Summarize by groups

# Average overall score per country, ranked from highest to lowest.
data %>%
  # overall_score is stored as character, so convert it before averaging
  mutate(overall_score = as.numeric(overall_score)) %>%
  
  # Group by country (one summary row per country)
  group_by(country) %>%
  
  # Calculate each country's average overall score, ignoring missing values
  summarise(avg_score = mean(overall_score, na.rm = TRUE)) %>%
  
  # Sort from the highest average score to the lowest
  arrange(desc(avg_score))

## # A tibble: 217 × 2
##    country       avg_score
##    <chr>             <dbl>
##  1 Finland            90.9
##  2 Sweden             90.7
##  3 Slovenia           90.2
##  4 Italy              90.0
##  5 Poland             89.4
##  6 Spain              89.4
##  7 United States      89.3
##  8 Canada             89.1
##  9 Denmark            88.8
## 10 Germany            88.7
## # ℹ 207 more rows

Module 6: Apply 5

Jennifer Chorvatovic

Import data

Apply the following dplyr verbs to your data

Filter rows

Arrange rows

Select columns

Add columns

Summarize by groups