R Markdown

Assignment 1

Importing data from different sources

Load data from CSV

This section loads the energy consumption dataset from a local CSV file.

The dataset contains appliance energy usage and environmental sensor readings

recorded every 10 minutes from a house in Belgium (January–May 2016).

library(readr)
## Warning: package 'readr' was built under R version 4.5.3
data <- read_csv("E:/Document/BIG Data Masters/Course Semester 1/Semester 1/R Programming for Data Science/energydata_complete.csv")
## Rows: 19735 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (28): Appliances, lights, T1, RH_1, T2, RH_2, T3, RH_3, T4, RH_4, T5, R...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(data)
## # A tibble: 6 × 29
##   date                Appliances lights    T1  RH_1    T2  RH_2    T3  RH_3
##   <dttm>                   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2016-01-11 17:00:00         60     30  19.9  47.6  19.2  44.8  19.8  44.7
## 2 2016-01-11 17:10:00         60     30  19.9  46.7  19.2  44.7  19.8  44.8
## 3 2016-01-11 17:20:00         50     30  19.9  46.3  19.2  44.6  19.8  44.9
## 4 2016-01-11 17:30:00         50     40  19.9  46.1  19.2  44.6  19.8  45  
## 5 2016-01-11 17:40:00         60     40  19.9  46.3  19.2  44.5  19.8  45  
## 6 2016-01-11 17:50:00         50     40  19.9  46.0  19.2  44.5  19.8  44.9
## # ℹ 20 more variables: T4 <dbl>, RH_4 <dbl>, T5 <dbl>, RH_5 <dbl>, T6 <dbl>,
## #   RH_6 <dbl>, T7 <dbl>, RH_7 <dbl>, T8 <dbl>, RH_8 <dbl>, T9 <dbl>,
## #   RH_9 <dbl>, T_out <dbl>, Press_mm_hg <dbl>, RH_out <dbl>, Windspeed <dbl>,
## #   Visibility <dbl>, Tdewpoint <dbl>, rv1 <dbl>, rv2 <dbl>
tail(data)
## # A tibble: 6 × 29
##   date                Appliances lights    T1  RH_1    T2  RH_2    T3  RH_3
##   <dttm>                   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2016-05-27 17:10:00         90      0  25.5  46.9  26.0  42.5  27.3  41.1
## 2 2016-05-27 17:20:00        100      0  25.6  46.6  25.9  42.0  27.2  41.2
## 3 2016-05-27 17:30:00         90      0  25.5  46.5  25.8  42.1  27.1  41.2
## 4 2016-05-27 17:40:00        270     10  25.5  46.6  25.6  42.8  27.0  41.7
## 5 2016-05-27 17:50:00        420     10  25.5  47.0  25.4  43.0  26.9  41.3
## 6 2016-05-27 18:00:00        430     10  25.5  46.6  25.3  43.0  26.8  41.2
## # ℹ 20 more variables: T4 <dbl>, RH_4 <dbl>, T5 <dbl>, RH_5 <dbl>, T6 <dbl>,
## #   RH_6 <dbl>, T7 <dbl>, RH_7 <dbl>, T8 <dbl>, RH_8 <dbl>, T9 <dbl>,
## #   RH_9 <dbl>, T_out <dbl>, Press_mm_hg <dbl>, RH_out <dbl>, Windspeed <dbl>,
## #   Visibility <dbl>, Tdewpoint <dbl>, rv1 <dbl>, rv2 <dbl>
summary(data)
##       date                       Appliances          lights      
##  Min.   :2016-01-11 17:00:00   Min.   :  10.00   Min.   : 0.000  
##  1st Qu.:2016-02-14 23:15:00   1st Qu.:  50.00   1st Qu.: 0.000  
##  Median :2016-03-20 05:30:00   Median :  60.00   Median : 0.000  
##  Mean   :2016-03-20 05:30:00   Mean   :  97.69   Mean   : 3.802  
##  3rd Qu.:2016-04-23 11:45:00   3rd Qu.: 100.00   3rd Qu.: 0.000  
##  Max.   :2016-05-27 18:00:00   Max.   :1080.00   Max.   :70.000  
##        T1             RH_1             T2             RH_2      
##  Min.   :16.79   Min.   :27.02   Min.   :16.10   Min.   :20.46  
##  1st Qu.:20.76   1st Qu.:37.33   1st Qu.:18.79   1st Qu.:37.90  
##  Median :21.60   Median :39.66   Median :20.00   Median :40.50  
##  Mean   :21.69   Mean   :40.26   Mean   :20.34   Mean   :40.42  
##  3rd Qu.:22.60   3rd Qu.:43.07   3rd Qu.:21.50   3rd Qu.:43.26  
##  Max.   :26.26   Max.   :63.36   Max.   :29.86   Max.   :56.03  
##        T3             RH_3             T4             RH_4      
##  Min.   :17.20   Min.   :28.77   Min.   :15.10   Min.   :27.66  
##  1st Qu.:20.79   1st Qu.:36.90   1st Qu.:19.53   1st Qu.:35.53  
##  Median :22.10   Median :38.53   Median :20.67   Median :38.40  
##  Mean   :22.27   Mean   :39.24   Mean   :20.86   Mean   :39.03  
##  3rd Qu.:23.29   3rd Qu.:41.76   3rd Qu.:22.10   3rd Qu.:42.16  
##  Max.   :29.24   Max.   :50.16   Max.   :26.20   Max.   :51.09  
##        T5             RH_5             T6              RH_6      
##  Min.   :15.33   Min.   :29.82   Min.   :-6.065   Min.   : 1.00  
##  1st Qu.:18.28   1st Qu.:45.40   1st Qu.: 3.627   1st Qu.:30.02  
##  Median :19.39   Median :49.09   Median : 7.300   Median :55.29  
##  Mean   :19.59   Mean   :50.95   Mean   : 7.911   Mean   :54.61  
##  3rd Qu.:20.62   3rd Qu.:53.66   3rd Qu.:11.256   3rd Qu.:83.23  
##  Max.   :25.80   Max.   :96.32   Max.   :28.290   Max.   :99.90  
##        T7             RH_7             T8             RH_8      
##  Min.   :15.39   Min.   :23.20   Min.   :16.31   Min.   :29.60  
##  1st Qu.:18.70   1st Qu.:31.50   1st Qu.:20.79   1st Qu.:39.07  
##  Median :20.03   Median :34.86   Median :22.10   Median :42.38  
##  Mean   :20.27   Mean   :35.39   Mean   :22.03   Mean   :42.94  
##  3rd Qu.:21.60   3rd Qu.:39.00   3rd Qu.:23.39   3rd Qu.:46.54  
##  Max.   :26.00   Max.   :51.40   Max.   :27.23   Max.   :58.78  
##        T9             RH_9           T_out         Press_mm_hg   
##  Min.   :14.89   Min.   :29.17   Min.   :-5.000   Min.   :729.3  
##  1st Qu.:18.00   1st Qu.:38.50   1st Qu.: 3.667   1st Qu.:750.9  
##  Median :19.39   Median :40.90   Median : 6.917   Median :756.1  
##  Mean   :19.49   Mean   :41.55   Mean   : 7.412   Mean   :755.5  
##  3rd Qu.:20.60   3rd Qu.:44.34   3rd Qu.:10.408   3rd Qu.:760.9  
##  Max.   :24.50   Max.   :53.33   Max.   :26.100   Max.   :772.3  
##      RH_out         Windspeed        Visibility      Tdewpoint     
##  Min.   : 24.00   Min.   : 0.000   Min.   : 1.00   Min.   :-6.600  
##  1st Qu.: 70.33   1st Qu.: 2.000   1st Qu.:29.00   1st Qu.: 0.900  
##  Median : 83.67   Median : 3.667   Median :40.00   Median : 3.433  
##  Mean   : 79.75   Mean   : 4.040   Mean   :38.33   Mean   : 3.761  
##  3rd Qu.: 91.67   3rd Qu.: 5.500   3rd Qu.:40.00   3rd Qu.: 6.567  
##  Max.   :100.00   Max.   :14.000   Max.   :66.00   Max.   :15.500  
##       rv1                 rv2           
##  Min.   : 0.005322   Min.   : 0.005322  
##  1st Qu.:12.497889   1st Qu.:12.497889  
##  Median :24.897653   Median :24.897653  
##  Mean   :24.988033   Mean   :24.988033  
##  3rd Qu.:37.583769   3rd Qu.:37.583769  
##  Max.   :49.996530   Max.   :49.996530
names(data)
##  [1] "date"        "Appliances"  "lights"      "T1"          "RH_1"       
##  [6] "T2"          "RH_2"        "T3"          "RH_3"        "T4"         
## [11] "RH_4"        "T5"          "RH_5"        "T6"          "RH_6"       
## [16] "T7"          "RH_7"        "T8"          "RH_8"        "T9"         
## [21] "RH_9"        "T_out"       "Press_mm_hg" "RH_out"      "Windspeed"  
## [26] "Visibility"  "Tdewpoint"   "rv1"         "rv2"
colSums(is.na(data))
##        date  Appliances      lights          T1        RH_1          T2 
##           0           0           0           0           0           0 
##        RH_2          T3        RH_3          T4        RH_4          T5 
##           0           0           0           0           0           0 
##        RH_5          T6        RH_6          T7        RH_7          T8 
##           0           0           0           0           0           0 
##        RH_8          T9        RH_9       T_out Press_mm_hg      RH_out 
##           0           0           0           0           0           0 
##   Windspeed  Visibility   Tdewpoint         rv1         rv2 
##           0           0           0           0           0
mean(data$Appliances)
## [1] 97.69496
sd(data$Appliances)
## [1] 102.5249
plot(data$Appliances)

hist(data$Appliances)

Load data from a PostgreSQL database

This section loads the salary prediction dataset from a PostgreSQL database.

library(DBI)
## Warning: package 'DBI' was built under R version 4.5.3
library(RPostgres)
## Warning: package 'RPostgres' was built under R version 4.5.3
# Connect to the local PostgreSQL instance and load the salary table.
# The password is read from the PGPASSWORD environment variable (for example
# set in ~/.Renviron) so that no secret is stored in the document itself.
# PGUSER may override the user name; it defaults to "admin".
pg_password <- Sys.getenv("PGPASSWORD")
if (!nzchar(pg_password)) {
  stop(
    "Set the PGPASSWORD environment variable before running this chunk.",
    call. = FALSE
  )
}

con <- dbConnect(
  RPostgres::Postgres(),
  dbname = "mydb",
  host = "localhost",
  port = 5432,
  user = Sys.getenv("PGUSER", unset = "admin"),
  password = pg_password
)

# Pull the whole table into a data frame for the exploration below.
data <- dbGetQuery(con, "SELECT * FROM salary_prediction")
summary(data$salary)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   31867  119358  143453  145718  169492  333046
head(data)
##            job_title experience_years education_level skills_count
## 1        AI Engineer               10        Bachelor            2
## 2       Data Analyst                5        Bachelor           17
## 3 Frontend Developer               18             PhD            4
## 4   Business Analyst               19             PhD           13
## 5    Product Manager               15        Bachelor            7
## 6        AI Engineer                0     High School            4
##        industry company_size  location remote_work certifications salary
## 1    Healthcare       Medium     India      Hybrid              2 109413
## 2       Telecom        Small Australia          No              0  93764
## 3         Media       Medium Singapore          No              1 148123
## 4        Retail       Medium    Canada         Yes              0 189123
## 5 Manufacturing        Large    Sweden         Yes              0 165069
## 6     Education   Enterprise       USA          No              2 180351
tail(data)
##                 job_title experience_years education_level skills_count
## 249995   Business Analyst               13         Diploma           17
## 249996  Software Engineer               17             PhD            2
## 249997 Frontend Developer               20             PhD            7
## 249998   Business Analyst                1        Bachelor           12
## 249999     Data Scientist                0     High School            2
## 250000       Data Analyst               16         Diploma            2
##          industry company_size location remote_work certifications salary
## 249995    Finance        Large      USA         Yes              3 171896
## 249996    Telecom   Enterprise    India          No              1 127791
## 249997    Telecom      Startup   Remote          No              2 154593
## 249998     Retail   Enterprise    India         Yes              0  75988
## 249999 Consulting        Small   Sweden      Hybrid              5  90467
## 250000 Technology       Medium       UK          No              5 133084
dbDisconnect(con)

Load data from a MySQL database

This section loads the student academic performance dataset from a MySQL database.

library(DBI)
library(RMariaDB)
## Warning: package 'RMariaDB' was built under R version 4.5.3
# Connect to the local MySQL/MariaDB server that hosts the student data.
con <- dbConnect(
  RMariaDB::MariaDB(),
  dbname = "student_academic_performance",
  host = "localhost",
  port = 3306,
  user = "root",
  password = ""
)

# The table holds a stray copy of the CSV header that was imported as a data
# row (student_id = 'student_id', numeric fields coerced to 0). Exclude it so
# it is not treated as a student with all-zero scores.
data <- dbGetQuery(
  con,
  "SELECT * FROM student_performance WHERE student_id <> 'student_id'"
)
 head(data)
##   student_id gender age study_hours_per_week attendance_rate parent_education
## 1    STU0001   Male  15                   25            63.8         Bachelor
## 2    STU0002 Female  15                    2            54.7         Bachelor
## 3    STU0003 Female  19                   10            90.5      High School
## 4    STU0004   Male  16                   26            66.8      High School
## 5    STU0005 Female  15                   25            73.0      High School
## 6    STU0006 Female  19                    8            85.2      High School
##   internet_access extracurricular previous_score final_score passed
## 1               0               0             41          67      0
## 2               0               0             83          28      0
## 3               0               0             73          49      0
## 4               0               0             75          70      0
## 5               0               0             67          77      0
## 6               0               0             40          37      0
tail(data)
##     student_id gender age study_hours_per_week attendance_rate parent_education
## 496    STU0496 Female  19                    6            78.3           Master
## 497    STU0497 Female  16                   27            61.1              PhD
## 498    STU0498 Female  18                   16            72.3           Master
## 499    STU0499   Male  17                   29            91.3             None
## 500    STU0500   Male  15                   29            75.4      High School
## 501 student_id gender   0                    0             0.0 parent_education
##     internet_access extracurricular previous_score final_score passed
## 496               0               0             51          27      0
## 497               0               0             47          74      0
## 498               0               0             52          61      0
## 499               0               0             39          86      0
## 500               0               0             34          74      0
## 501               0               0              0           0      0
  dbDisconnect(con)

Assignment 2

This section demonstrates how to merge 2 to 3 datasets using common key columns.

We load three global datasets — world population, CO2 emissions, and GDP —

then join them by country to enable cross-dataset analysis.

load data

#library(tidyverse)

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'tibble' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## Warning: package 'stringr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ purrr     1.2.2
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## Warning: package 'janitor' was built under R version 4.5.3
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(visdat)
## Warning: package 'visdat' was built under R version 4.5.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.5.3
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.5.3
library(GGally)
## Warning: package 'GGally' was built under R version 4.5.3
library(dplyr)
library(purrr)


population <- read_csv("E:/Document/BIG Data Masters/Course Semester 1/Semester 1/R Programming for Data Science/world_population.csv",show_col_types = FALSE)

cos2 <- read_csv("E:/Document/BIG Data Masters/Course Semester 1/Semester 1/R Programming for Data Science/CO2_emission.csv",show_col_types = FALSE)
## New names:
## • `2019` -> `2019...34`
## • `2019` -> `2019...35`
gdp_Countries <- read_csv("E:/Document/BIG Data Masters/Course Semester 1/Semester 1/R Programming for Data Science/gdp_2020-2025.csv",show_col_types = FALSE)

Clean the column names of the datasets

# Normalise the column names of all three datasets with janitor so the join
# keys and year columns can be referenced with consistent, syntactic names.
library(janitor)

population <- clean_names(population)
cos2 <- clean_names(cos2)
gdp_Countries <- clean_names(gdp_Countries)

This section demonstrates how to merge 2 to 3 datasets using common key columns.

Using the inner_join() function

# Keep only the countries that appear in both tables, matching the population
# table's country_territory against the CO2 table's country_name.
merged_population_cos2 <- inner_join(
  population,
  cos2,
  by = join_by(country_territory == country_name)
)

print(merged_population_cos2)
## # A tibble: 186 × 51
##     rank cca3  country_territory   capital          continent   x2022_population
##    <dbl> <chr> <chr>               <chr>            <chr>                  <dbl>
##  1    36 AFG   Afghanistan         Kabul            Asia                41128771
##  2   138 ALB   Albania             Tirana           Europe               2842321
##  3    34 DZA   Algeria             Algiers          Africa              44903225
##  4   213 ASM   American Samoa      Pago Pago        Oceania                44273
##  5   203 AND   Andorra             Andorra la Vella Europe                 79824
##  6    42 AGO   Angola              Luanda           Africa              35588987
##  7   201 ATG   Antigua and Barbuda Saint John’s     North Amer…            93763
##  8    33 ARG   Argentina           Buenos Aires     South Amer…         45510318
##  9   140 ARM   Armenia             Yerevan          Asia                 2780469
## 10   198 ABW   Aruba               Oranjestad       North Amer…           106445
## # ℹ 176 more rows
## # ℹ 45 more variables: x2020_population <dbl>, x2015_population <dbl>,
## #   x2010_population <dbl>, x2000_population <dbl>, x1990_population <dbl>,
## #   x1980_population <dbl>, x1970_population <dbl>, area_km2 <dbl>,
## #   density_per_km2 <dbl>, growth_rate <dbl>,
## #   world_population_percentage <dbl>, country_code <chr>, region <chr>,
## #   indicator_name <chr>, x1990 <dbl>, x1991 <dbl>, x1992 <dbl>, x1993 <dbl>, …
# Extend the population/CO2 join (merged_population_cos2) with the GDP table;
# only countries present in all three datasets remain.
merged_all_dataset <- merged_population_cos2 %>%
  inner_join(gdp_Countries, by = c("country_territory" = "country"))

print(merged_all_dataset)
## # A tibble: 168 × 57
##     rank cca3  country_territory   capital          continent   x2022_population
##    <dbl> <chr> <chr>               <chr>            <chr>                  <dbl>
##  1    36 AFG   Afghanistan         Kabul            Asia                41128771
##  2   138 ALB   Albania             Tirana           Europe               2842321
##  3    34 DZA   Algeria             Algiers          Africa              44903225
##  4   203 AND   Andorra             Andorra la Vella Europe                 79824
##  5    42 AGO   Angola              Luanda           Africa              35588987
##  6   201 ATG   Antigua and Barbuda Saint John’s     North Amer…            93763
##  7    33 ARG   Argentina           Buenos Aires     South Amer…         45510318
##  8   140 ARM   Armenia             Yerevan          Asia                 2780469
##  9   198 ABW   Aruba               Oranjestad       North Amer…           106445
## 10    55 AUS   Australia           Canberra         Oceania             26177413
## # ℹ 158 more rows
## # ℹ 51 more variables: x2020_population <dbl>, x2015_population <dbl>,
## #   x2010_population <dbl>, x2000_population <dbl>, x1990_population <dbl>,
## #   x1980_population <dbl>, x1970_population <dbl>, area_km2 <dbl>,
## #   density_per_km2 <dbl>, growth_rate <dbl>,
## #   world_population_percentage <dbl>, country_code <chr>, region <chr>,
## #   indicator_name <chr>, x1990 <dbl>, x1991 <dbl>, x1992 <dbl>, x1993 <dbl>, …

Assignment 3: using group_by() and %>%

This section groups the world population data by continent and calculates

the average 2022 population per continent. This helps compare population

distribution across different regions of the world.

# Average 2022 population per continent; missing values are ignored.
summarise(
  group_by(population, continent),
  avg_population = mean(x2022_population, na.rm = TRUE)
)
## # A tibble: 6 × 2
##   continent     avg_population
##   <chr>                  <dbl>
## 1 Africa             25030367.
## 2 Asia               94427665.
## 3 Europe             14862951.
## 4 North America      15007403.
## 5 Oceania             1958198 
## 6 South America      31201186.
# Number of rows (countries/territories) recorded for each continent.
count(population, continent, name = "number_of_countries")
## # A tibble: 6 × 2
##   continent     number_of_countries
##   <chr>                       <int>
## 1 Africa                         57
## 2 Asia                           50
## 3 Europe                         50
## 4 North America                  40
## 5 Oceania                        23
## 6 South America                  14

Select — the select() function keeps only the specified columns

population %>%
  select(country_territory, continent, density_per_km2, growth_rate, area_km2, x2022_population)
## # A tibble: 234 × 6
##    country_territory   continent     density_per_km2 growth_rate area_km2
##    <chr>               <chr>                   <dbl>       <dbl>    <dbl>
##  1 Afghanistan         Asia                     63.1       1.03    652230
##  2 Albania             Europe                   98.9       0.996    28748
##  3 Algeria             Africa                   18.9       1.02   2381741
##  4 American Samoa      Oceania                 222.        0.983      199
##  5 Andorra             Europe                  171.        1.01       468
##  6 Angola              Africa                   28.5       1.03   1246700
##  7 Anguilla            North America           174.        1.01        91
##  8 Antigua and Barbuda North America           212.        1.01       442
##  9 Argentina           South America            16.4       1.01   2780400
## 10 Armenia             Asia                     93.5       0.996    29743
## # ℹ 224 more rows
## # ℹ 1 more variable: x2022_population <dbl>

Filter — keep only the rows for African countries

population %>%
  filter(continent == "Africa")
## # A tibble: 57 × 17
##     rank cca3  country_territory        capital     continent x2022_population
##    <dbl> <chr> <chr>                    <chr>       <chr>                <dbl>
##  1    34 DZA   Algeria                  Algiers     Africa            44903225
##  2    42 AGO   Angola                   Luanda      Africa            35588987
##  3    77 BEN   Benin                    Porto-Novo  Africa            13352864
##  4   144 BWA   Botswana                 Gaborone    Africa             2630296
##  5    58 BFA   Burkina Faso             Ouagadougou Africa            22673762
##  6    78 BDI   Burundi                  Bujumbura   Africa            12889576
##  7    53 CMR   Cameroon                 Yaounde     Africa            27914536
##  8   171 CPV   Cape Verde               Praia       Africa              593149
##  9   117 CAF   Central African Republic Bangui      Africa             5579144
## 10    69 TCD   Chad                     N'Djamena   Africa            17723315
## # ℹ 47 more rows
## # ℹ 11 more variables: x2020_population <dbl>, x2015_population <dbl>,
## #   x2010_population <dbl>, x2000_population <dbl>, x1990_population <dbl>,
## #   x1980_population <dbl>, x1970_population <dbl>, area_km2 <dbl>,
## #   density_per_km2 <dbl>, growth_rate <dbl>, world_population_percentage <dbl>

Arrange — sort by 2022 population descending

population %>%
  arrange(desc(x2022_population))
## # A tibble: 234 × 17
##     rank cca3  country_territory capital          continent     x2022_population
##    <dbl> <chr> <chr>             <chr>            <chr>                    <dbl>
##  1     1 CHN   China             Beijing          Asia                1425887337
##  2     2 IND   India             New Delhi        Asia                1417173173
##  3     3 USA   United States     Washington, D.C. North America        338289857
##  4     4 IDN   Indonesia         Jakarta          Asia                 275501339
##  5     5 PAK   Pakistan          Islamabad        Asia                 235824862
##  6     6 NGA   Nigeria           Abuja            Africa               218541212
##  7     7 BRA   Brazil            Brasilia         South America        215313498
##  8     8 BGD   Bangladesh        Dhaka            Asia                 171186372
##  9     9 RUS   Russia            Moscow           Europe               144713314
## 10    10 MEX   Mexico            Mexico City      North America        127504125
## # ℹ 224 more rows
## # ℹ 11 more variables: x2020_population <dbl>, x2015_population <dbl>,
## #   x2010_population <dbl>, x2000_population <dbl>, x1990_population <dbl>,
## #   x1980_population <dbl>, x1970_population <dbl>, area_km2 <dbl>,
## #   density_per_km2 <dbl>, growth_rate <dbl>, world_population_percentage <dbl>

Rename — cleaner column names

pupulation_rename  <- population
print(pupulation_rename)
## # A tibble: 234 × 17
##     rank cca3  country_territory   capital          continent   x2022_population
##    <dbl> <chr> <chr>               <chr>            <chr>                  <dbl>
##  1    36 AFG   Afghanistan         Kabul            Asia                41128771
##  2   138 ALB   Albania             Tirana           Europe               2842321
##  3    34 DZA   Algeria             Algiers          Africa              44903225
##  4   213 ASM   American Samoa      Pago Pago        Oceania                44273
##  5   203 AND   Andorra             Andorra la Vella Europe                 79824
##  6    42 AGO   Angola              Luanda           Africa              35588987
##  7   224 AIA   Anguilla            The Valley       North Amer…            15857
##  8   201 ATG   Antigua and Barbuda Saint John’s     North Amer…            93763
##  9    33 ARG   Argentina           Buenos Aires     South Amer…         45510318
## 10   140 ARM   Armenia             Yerevan          Asia                 2780469
## # ℹ 224 more rows
## # ℹ 11 more variables: x2020_population <dbl>, x2015_population <dbl>,
## #   x2010_population <dbl>, x2000_population <dbl>, x1990_population <dbl>,
## #   x1980_population <dbl>, x1970_population <dbl>, area_km2 <dbl>,
## #   density_per_km2 <dbl>, growth_rate <dbl>, world_population_percentage <dbl>
pupulation_rename <- population %>%
  rename(
    country         = country_territory,
    population_2022 = x2022_population,
    density         = density_per_km2,
    area            = area_km2
  )

#print(population)

Mutate — add new columns

# Derive growth figures between 2000 and 2022 and bucket countries by density.
mutate(
  population,
  # Absolute change in population from 2000 to 2022.
  pop_growth_abs = x2022_population - x2000_population,
  # The same change expressed as a percentage of the 2000 population.
  pop_growth_pct = (pop_growth_abs / x2000_population) * 100,
  # Density bands: below 50 is Low, below 200 is Medium, everything else High.
  density_category = case_when(
    density_per_km2 < 50 ~ "Low",
    density_per_km2 < 200 ~ "Medium",
    .default = "High"
  )
)
## # A tibble: 234 × 20
##     rank cca3  country_territory   capital          continent   x2022_population
##    <dbl> <chr> <chr>               <chr>            <chr>                  <dbl>
##  1    36 AFG   Afghanistan         Kabul            Asia                41128771
##  2   138 ALB   Albania             Tirana           Europe               2842321
##  3    34 DZA   Algeria             Algiers          Africa              44903225
##  4   213 ASM   American Samoa      Pago Pago        Oceania                44273
##  5   203 AND   Andorra             Andorra la Vella Europe                 79824
##  6    42 AGO   Angola              Luanda           Africa              35588987
##  7   224 AIA   Anguilla            The Valley       North Amer…            15857
##  8   201 ATG   Antigua and Barbuda Saint John’s     North Amer…            93763
##  9    33 ARG   Argentina           Buenos Aires     South Amer…         45510318
## 10   140 ARM   Armenia             Yerevan          Asia                 2780469
## # ℹ 224 more rows
## # ℹ 14 more variables: x2020_population <dbl>, x2015_population <dbl>,
## #   x2010_population <dbl>, x2000_population <dbl>, x1990_population <dbl>,
## #   x1980_population <dbl>, x1970_population <dbl>, area_km2 <dbl>,
## #   density_per_km2 <dbl>, growth_rate <dbl>,
## #   world_population_percentage <dbl>, pop_growth_abs <dbl>,
## #   pop_growth_pct <dbl>, density_category <chr>

Assignment 4: how to use trace() and recover()

# Solve the quadratic equation a*x^2 + b*x + c = 0 and print its real roots.
#
# Arguments:
#   a  Coefficient of x^2; a single non-zero number.
#   b  Coefficient of x; a single number.
#   c  Constant term; a single number.
#
# Prints both roots when the discriminant is positive, the repeated root when
# it is zero, and a message when the roots are complex.
# Raises an error when an input is not a single non-missing number, or when
# a is 0 (the equation is then not quadratic and 2 * a would divide by zero).
quadratic_equation <- function(a, b, c) {
  coefs <- list(a = a, b = b, c = c)
  for (nm in names(coefs)) {
    val <- coefs[[nm]]
    if (!is.numeric(val) || length(val) != 1L || is.na(val)) {
      stop("`", nm, "` must be a single non-missing number.", call. = FALSE)
    }
  }
  if (a == 0) {
    stop("`a` must be non-zero for a quadratic equation.", call. = FALSE)
  }

  # Discriminant: its sign decides how many real roots exist.
  delta <- (b * b) - (4 * a * c)

  if (delta > 0) {
    # Two distinct real roots.
    r1 <- (-b + sqrt(delta)) / (2 * a)
    r2 <- (-b - sqrt(delta)) / (2 * a)
    print(r1)
    print(r2)

  } else if (delta == 0) {
    # One repeated real root.
    r1 <- -b / (2 * a)
    print(paste("One real root:", r1))

  } else {
    print("Complex roots exist.")
  }
}

trace()

trace(
  "quadratic_equation",
  tracer = quote(print(paste("Inputs:", a, b, c))),
  print = FALSE
)
## [1] "quadratic_equation"
quadratic_equation(1, -5, 6)
## [1] "Inputs: 1 -5 6"
## [1] 3
## [1] 2
untrace("quadratic_equation")

Using recover()

recover() is used as an error handler: after calling options(error = recover), R opens an interactive browser whenever an error happens.

It lets you inspect the environment where the error occurred.

# Solve a*x^2 + b*x + c = 0 and print the real roots, rounded to 2 decimals.
#
# Arguments:
#   a, b, c  Coefficients of the quadratic, linear and constant terms.
#
# Prints "Root 1"/"Root 2" when the discriminant is positive, "One root" when
# it is zero, and a "No real roots" message when it is negative.
# NOTE(review): a == 0 is not checked, so the divisions by (2 * a) divide by
# zero in that case.
quadratic_equation <- function(a, b, c) {
  # Discriminant; its sign decides which branch below runs.
  delta <- (b * b) - (4 * a * c)
  
  if (delta > 0) {
    # Two distinct real roots.
    r1 <- (-b + sqrt(delta)) / (2 * a)
    r2 <- (-b - sqrt(delta)) / (2 * a)
    print(paste("Root 1:", round(r1, 2)))
    print(paste("Root 2:", round(r2, 2)))
    
  } else if (delta == 0) {
    # Single repeated root.
    r1 <- -b / (2 * a)
    print(paste("One root:", round(r1, 2)))
    
  } else {
    print("No real roots — delta is negative")
  }
}

# error = NULL restores R's default error handling. To actually drop into
# recover() when an error occurs (interactive sessions only), install it with
# options(error = recover) before calling the function.
options(error = NULL)
quadratic_equation(1, -5, 6)
## [1] "Root 1: 3"
## [1] "Root 2: 2"

Debugging Workflow

options(error = NULL)  # reset error handler

# Print delta from inside the function. `at` counts steps of the function
# body, not source lines: the opening `{` is step 1 and the delta assignment
# is step 2, so the tracer runs just before step 3, once delta exists.
trace(
  "quadratic_equation",
  tracer = quote(print(paste("Delta =", delta))),
  at = 3,              # step 3 = the `if` statement, right AFTER delta is assigned
  print = FALSE
)
## [1] "quadratic_equation"
quadratic_equation(1, -5, 6)
## [1] "Delta = 1"
## [1] "Root 1: 3"
## [1] "Root 2: 2"
untrace("quadratic_equation")

Assignment 5: how to use sapply(), vapply(), lapply(), map(), mapply() and summary statistics

Write a function that prints summary statistics

print(population)
## # A tibble: 234 × 17
##     rank cca3  country_territory   capital          continent   x2022_population
##    <dbl> <chr> <chr>               <chr>            <chr>                  <dbl>
##  1    36 AFG   Afghanistan         Kabul            Asia                41128771
##  2   138 ALB   Albania             Tirana           Europe               2842321
##  3    34 DZA   Algeria             Algiers          Africa              44903225
##  4   213 ASM   American Samoa      Pago Pago        Oceania                44273
##  5   203 AND   Andorra             Andorra la Vella Europe                 79824
##  6    42 AGO   Angola              Luanda           Africa              35588987
##  7   224 AIA   Anguilla            The Valley       North Amer…            15857
##  8   201 ATG   Antigua and Barbuda Saint John’s     North Amer…            93763
##  9    33 ARG   Argentina           Buenos Aires     South Amer…         45510318
## 10   140 ARM   Armenia             Yerevan          Asia                 2780469
## # ℹ 224 more rows
## # ℹ 11 more variables: x2020_population <dbl>, x2015_population <dbl>,
## #   x2010_population <dbl>, x2000_population <dbl>, x1990_population <dbl>,
## #   x1980_population <dbl>, x1970_population <dbl>, area_km2 <dbl>,
## #   density_per_km2 <dbl>, growth_rate <dbl>, world_population_percentage <dbl>
# Print summary statistics for one numeric column of a data frame.
#
# Arguments:
#   data    A data frame or tibble.
#   column  Name (or index) of the column to summarise.
#
# Prints the count, mean, median, standard deviation, minimum, maximum (all
# rounded to 2 decimals, computed with missing values removed) and the number
# of missing values.
# Returns, invisibly, a named list with the same figures so they can be reused.
# Raises an error when the column does not exist or is not numeric/logical.
summary_stats <- function(data, column) {
  if (is.character(column) && !all(column %in% names(data))) {
    stop("Column `", column, "` not found in `data`.", call. = FALSE)
  }
  values <- data[[column]]
  if (!(is.numeric(values) || is.logical(values))) {
    stop("Column `", column, "` must be numeric.", call. = FALSE)
  }

  # A list (not a numeric vector) keeps the integer counts as integers, so
  # cat() formats them exactly as before.
  stats <- list(
    count = length(values),
    mean = round(mean(values, na.rm = TRUE), 2),
    median = round(median(values, na.rm = TRUE), 2),
    sd = round(sd(values, na.rm = TRUE), 2),
    min = round(min(values, na.rm = TRUE), 2),
    max = round(max(values, na.rm = TRUE), 2),
    missing = sum(is.na(values))
  )

  cat("Summary Statistics for:", column, "\n")
  cat("Count   :", stats$count, "\n")
  cat("Mean    :", stats$mean, "\n")
  cat("Median  :", stats$median, "\n")
  cat("Std Dev :", stats$sd, "\n")
  cat("Min     :", stats$min, "\n")
  cat("Max     :", stats$max, "\n")
  cat("Missing :", stats$missing, "\n")

  invisible(stats)
}

# Run on population column
summary_stats(population, "x2022_population")
## Summary Statistics for: x2022_population 
## Count   : 234 
## Mean    : 34074415 
## Median  : 5559945 
## Std Dev : 136766425 
## Min     : 510 
## Max     : 1425887337 
## Missing : 0
summary_stats(population, "density_per_km2")
## Summary Statistics for: density_per_km2 
## Count   : 234 
## Mean    : 452.13 
## Median  : 95.35 
## Std Dev : 2066.12 
## Min     : 0.03 
## Max     : 23172.27 
## Missing : 0
summary_stats(population, "area_km2")
## Summary Statistics for: area_km2 
## Count   : 234 
## Mean    : 581449.4 
## Median  : 81199.5 
## Std Dev : 1761841 
## Min     : 1 
## Max     : 17098242 
## Missing : 0
summary_stats(population, "growth_rate")
## Summary Statistics for: growth_rate 
## Count   : 234 
## Mean    : 1.01 
## Median  : 1.01 
## Std Dev : 0.01 
## Min     : 0.91 
## Max     : 1.07 
## Missing : 0

Using sapply: column means returned as a named vector

# Column means, ignoring missing values; sapply() simplifies the per-column
# results into a single named numeric vector.
sapply(
  X = population[c("density_per_km2", "growth_rate", "x2022_population")],
  FUN = function(column_values) mean(column_values, na.rm = TRUE)
)
##  density_per_km2      growth_rate x2022_population 
##     4.521270e+02     1.009577e+00     3.407441e+07

Using lapply: per-column summaries returned as a list

# Five-number summary plus mean for each selected column; lapply() keeps the
# results as a list named after the columns.
lapply(
  X = population[c("density_per_km2", "growth_rate", "x2022_population")],
  FUN = function(column_values) summary(column_values)
)
## $density_per_km2
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 2.610e-02 3.842e+01 9.535e+01 4.521e+02 2.389e+02 2.317e+04 
## 
## $growth_rate
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.912   1.002   1.008   1.010   1.017   1.069 
## 
## $x2022_population
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 5.100e+02 4.197e+05 5.560e+06 3.407e+07 2.248e+07 1.426e+09

Using vapply: column means with a checked return type

# Column means, ignoring missing values; FUN.VALUE makes vapply() fail unless
# every column yields exactly one numeric value.
vapply(
  X = population[c("density_per_km2", "growth_rate", "x2022_population")],
  FUN = mean,
  FUN.VALUE = numeric(1),
  na.rm = TRUE
)
##  density_per_km2      growth_rate x2022_population 
##     4.521270e+02     1.009577e+00     3.407441e+07

Splitting the population data by continent (split and tapply)

# One tibble per continent, stored in a list named by continent
population_split <- split(x = population, f = population[["continent"]])

# Show the structure of each continent's subset
str(object = population_split)
## List of 6
##  $ Africa       : tibble [57 × 17] (S3: tbl_df/tbl/data.frame)
##   ..$ rank                       : num [1:57] 34 42 77 144 58 78 53 171 117 69 ...
##   ..$ cca3                       : chr [1:57] "DZA" "AGO" "BEN" "BWA" ...
##   ..$ country_territory          : chr [1:57] "Algeria" "Angola" "Benin" "Botswana" ...
##   ..$ capital                    : chr [1:57] "Algiers" "Luanda" "Porto-Novo" "Gaborone" ...
##   ..$ continent                  : chr [1:57] "Africa" "Africa" "Africa" "Africa" ...
##   ..$ x2022_population           : num [1:57] 44903225 35588987 13352864 2630296 22673762 ...
##   ..$ x2020_population           : num [1:57] 43451666 33428485 12643123 2546402 21522626 ...
##   ..$ x2015_population           : num [1:57] 39543154 28127721 10932783 2305171 18718019 ...
##   ..$ x2010_population           : num [1:57] 35856344 23364185 9445710 2091664 16116845 ...
##   ..$ x2000_population           : num [1:57] 30774621 16394062 6998023 1726985 11882888 ...
##   ..$ x1990_population           : num [1:57] 25518074 11828638 5133419 1341474 9131361 ...
##   ..$ x1980_population           : num [1:57] 18739378 8330047 3833939 938578 6932967 ...
##   ..$ x1970_population           : num [1:57] 13795915 6029700 3023443 592244 5611666 ...
##   ..$ area_km2                   : num [1:57] 2381741 1246700 112622 582000 272967 ...
##   ..$ density_per_km2            : num [1:57] 18.85 28.55 118.56 4.52 83.06 ...
##   ..$ growth_rate                : num [1:57] 1.02 1.03 1.03 1.02 1.03 ...
##   ..$ world_population_percentage: num [1:57] 0.56 0.45 0.17 0.03 0.28 0.16 0.35 0.01 0.07 0.22 ...
##  $ Asia         : tibble [50 × 17] (S3: tbl_df/tbl/data.frame)
##   ..$ rank                       : num [1:50] 36 140 91 154 8 165 175 73 1 131 ...
##   ..$ cca3                       : chr [1:50] "AFG" "ARM" "AZE" "BHR" ...
##   ..$ country_territory          : chr [1:50] "Afghanistan" "Armenia" "Azerbaijan" "Bahrain" ...
##   ..$ capital                    : chr [1:50] "Kabul" "Yerevan" "Baku" "Manama" ...
##   ..$ continent                  : chr [1:50] "Asia" "Asia" "Asia" "Asia" ...
##   ..$ x2022_population           : num [1:50] 4.11e+07 2.78e+06 1.04e+07 1.47e+06 1.71e+08 ...
##   ..$ x2020_population           : num [1:50] 3.90e+07 2.81e+06 1.03e+07 1.48e+06 1.67e+08 ...
##   ..$ x2015_population           : num [1:50] 3.38e+07 2.88e+06 9.86e+06 1.36e+06 1.58e+08 ...
##   ..$ x2010_population           : num [1:50] 2.82e+07 2.95e+06 9.24e+06 1.21e+06 1.48e+08 ...
##   ..$ x2000_population           : num [1:50] 1.95e+07 3.17e+06 8.19e+06 7.11e+05 1.29e+08 ...
##   ..$ x1990_population           : num [1:50] 1.07e+07 3.56e+06 7.43e+06 5.17e+05 1.07e+08 ...
##   ..$ x1980_population           : num [1:50] 12486631 3135123 6383060 362595 83929765 ...
##   ..$ x1970_population           : num [1:50] 10752971 2534377 5425317 222555 67541860 ...
##   ..$ area_km2                   : num [1:50] 652230 29743 86600 765 147570 ...
##   ..$ density_per_km2            : num [1:50] 63.1 93.5 119.6 1924.5 1160 ...
##   ..$ growth_rate                : num [1:50] 1.026 0.996 1.004 1.006 1.011 ...
##   ..$ world_population_percentage: num [1:50] 0.52 0.03 0.13 0.02 2.15 ...
##  $ Europe       : tibble [50 × 17] (S3: tbl_df/tbl/data.frame)
##   ..$ rank                       : num [1:50] 138 203 99 96 81 137 108 130 158 88 ...
##   ..$ cca3                       : chr [1:50] "ALB" "AND" "AUT" "BLR" ...
##   ..$ country_territory          : chr [1:50] "Albania" "Andorra" "Austria" "Belarus" ...
##   ..$ capital                    : chr [1:50] "Tirana" "Andorra la Vella" "Vienna" "Minsk" ...
##   ..$ continent                  : chr [1:50] "Europe" "Europe" "Europe" "Europe" ...
##   ..$ x2022_population           : num [1:50] 2842321 79824 8939617 9534954 11655930 ...
##   ..$ x2020_population           : num [1:50] 2866849 77700 8907777 9633740 11561717 ...
##   ..$ x2015_population           : num [1:50] 2882481 71746 8642421 9700609 11248303 ...
##   ..$ x2010_population           : num [1:50] 2913399 71519 8362829 9731427 10877947 ...
##   ..$ x2000_population           : num [1:50] 3182021 66097 8010428 10256483 10264343 ...
##   ..$ x1990_population           : num [1:50] 3295066 53569 7678729 10428525 9959560 ...
##   ..$ x1980_population           : num [1:50] 2941651 35611 7547561 9817257 9828986 ...
##   ..$ x1970_population           : num [1:50] 2324731 19860 7465301 9170786 9629376 ...
##   ..$ area_km2                   : num [1:50] 28748 468 83871 207600 30528 ...
##   ..$ density_per_km2            : num [1:50] 98.9 170.6 106.6 45.9 381.8 ...
##   ..$ growth_rate                : num [1:50] 0.996 1.01 1.002 0.996 1.004 ...
##   ..$ world_population_percentage: num [1:50] 0.04 0 0.11 0.12 0.15 0.04 0.09 0.05 0.02 0.13 ...
##  $ North America: tibble [40 × 17] (S3: tbl_df/tbl/data.frame)
##   ..$ rank                       : num [1:40] 224 201 198 176 186 177 206 221 39 205 ...
##   ..$ cca3                       : chr [1:40] "AIA" "ATG" "ABW" "BHS" ...
##   ..$ country_territory          : chr [1:40] "Anguilla" "Antigua and Barbuda" "Aruba" "Bahamas" ...
##   ..$ capital                    : chr [1:40] "The Valley" "Saint John’s" "Oranjestad" "Nassau" ...
##   ..$ continent                  : chr [1:40] "North America" "North America" "North America" "North America" ...
##   ..$ x2022_population           : num [1:40] 15857 93763 106445 409984 281635 ...
##   ..$ x2020_population           : num [1:40] 15585 92664 106585 406471 280693 ...
##   ..$ x2015_population           : num [1:40] 14525 89941 104257 392697 278083 ...
##   ..$ x2010_population           : num [1:40] 13172 85695 100341 373272 274711 ...
##   ..$ x2000_population           : num [1:40] 11047 75055 89101 325014 264657 ...
##   ..$ x1990_population           : num [1:40] 8316 63328 65712 270679 258868 ...
##   ..$ x1980_population           : num [1:40] 6560 64888 62267 223752 253575 ...
##   ..$ x1970_population           : num [1:40] 6283 64516 59106 179129 241397 ...
##   ..$ area_km2                   : num [1:40] 91 442 180 13943 430 ...
##   ..$ density_per_km2            : num [1:40] 174.3 212.1 591.4 29.4 655 ...
##   ..$ growth_rate                : num [1:40] 1.007 1.006 0.999 1.005 1.002 ...
##   ..$ world_population_percentage: num [1:40] 0 0 0 0.01 0 0.01 0 0 0.48 0 ...
##  $ Oceania      : tibble [23 × 17] (S3: tbl_df/tbl/data.frame)
##   ..$ rank                       : num [1:23] 213 55 223 162 183 191 192 215 194 225 ...
##   ..$ cca3                       : chr [1:23] "ASM" "AUS" "COK" "FJI" ...
##   ..$ country_territory          : chr [1:23] "American Samoa" "Australia" "Cook Islands" "Fiji" ...
##   ..$ capital                    : chr [1:23] "Pago Pago" "Canberra" "Avarua" "Suva" ...
##   ..$ continent                  : chr [1:23] "Oceania" "Oceania" "Oceania" "Oceania" ...
##   ..$ x2022_population           : num [1:23] 44273 26177413 17011 929766 306279 ...
##   ..$ x2020_population           : num [1:23] 46189 25670051 17029 920422 301920 ...
##   ..$ x2015_population           : num [1:23] 51368 23820236 17695 917200 291787 ...
##   ..$ x2010_population           : num [1:23] 54849 22019168 17212 905169 283788 ...
##   ..$ x2000_population           : num [1:23] 58230 19017963 15897 832509 250927 ...
##   ..$ x1990_population           : num [1:23] 47818 17048003 17123 780430 211089 ...
##   ..$ x1980_population           : num [1:23] 32886 14706322 17651 644582 163591 ...
##   ..$ x1970_population           : num [1:23] 27075 12595034 20470 527634 117891 ...
##   ..$ area_km2                   : num [1:23] 199 7692024 236 18272 4167 ...
##   ..$ density_per_km2            : num [1:23] 222.5 3.4 72.1 50.9 73.5 ...
##   ..$ growth_rate                : num [1:23] 0.983 1.01 1 1.006 1.007 ...
##   ..$ world_population_percentage: num [1:23] 0 0.33 0 0.01 0 0 0 0 0 0 ...
##  $ South America: tibble [14 × 17] (S3: tbl_df/tbl/data.frame)
##   ..$ rank                       : num [1:14] 33 80 7 65 28 67 231 184 164 109 ...
##   ..$ cca3                       : chr [1:14] "ARG" "BOL" "BRA" "CHL" ...
##   ..$ country_territory          : chr [1:14] "Argentina" "Bolivia" "Brazil" "Chile" ...
##   ..$ capital                    : chr [1:14] "Buenos Aires" "Sucre" "Brasilia" "Santiago" ...
##   ..$ continent                  : chr [1:14] "South America" "South America" "South America" "South America" ...
##   ..$ x2022_population           : num [1:14] 4.55e+07 1.22e+07 2.15e+08 1.96e+07 5.19e+07 ...
##   ..$ x2020_population           : num [1:14] 4.50e+07 1.19e+07 2.13e+08 1.93e+07 5.09e+07 ...
##   ..$ x2015_population           : num [1:14] 4.33e+07 1.11e+07 2.05e+08 1.79e+07 4.71e+07 ...
##   ..$ x2010_population           : num [1:14] 4.11e+07 1.02e+07 1.96e+08 1.70e+07 4.48e+07 ...
##   ..$ x2000_population           : num [1:14] 3.71e+07 8.59e+06 1.76e+08 1.54e+07 3.92e+07 ...
##   ..$ x1990_population           : num [1:14] 3.26e+07 7.10e+06 1.51e+08 1.33e+07 3.26e+07 ...
##   ..$ x1980_population           : num [1:14] 2.80e+07 5.74e+06 1.22e+08 1.15e+07 2.62e+07 ...
##   ..$ x1970_population           : num [1:14] 23842803 4585693 96369875 9820481 20905254 ...
##   ..$ area_km2                   : num [1:14] 2780400 1098581 8515767 756102 1141748 ...
##   ..$ density_per_km2            : num [1:14] 16.4 11.1 25.3 25.9 45.4 ...
##   ..$ growth_rate                : num [1:14] 1.01 1.01 1 1.01 1.01 ...
##   ..$ world_population_percentage: num [1:14] 0.57 0.15 2.7 0.25 0.65 0.23 0 0 0.01 0.09 ...
  # Mean population density per continent, ignoring missing values
  tapply(
    X = population[["density_per_km2"]],
    INDEX = population[["continent"]],
    FUN = function(density) mean(density, na.rm = TRUE)
  )
##        Africa          Asia        Europe North America       Oceania 
##     125.04765    1025.02414     663.32474     272.76176     132.54307 
## South America 
##      20.97198

Using purrr::map: column means returned as a list

library(purrr)  # provides map()

# Column means, ignoring missing values; map() always returns a list with one
# element per selected column.
map(
  .x = population[c("density_per_km2", "growth_rate", "x2022_population")],
  .f = function(column_values) mean(column_values, na.rm = TRUE)
)
## $density_per_km2
## [1] 452.127
## 
## $growth_rate
## [1] 1.009577
## 
## $x2022_population
## [1] 34074415
# Display the population tibble (the output below shows the first 10 rows)
print(population)
## # A tibble: 234 × 17
##     rank cca3  country_territory   capital          continent   x2022_population
##    <dbl> <chr> <chr>               <chr>            <chr>                  <dbl>
##  1    36 AFG   Afghanistan         Kabul            Asia                41128771
##  2   138 ALB   Albania             Tirana           Europe               2842321
##  3    34 DZA   Algeria             Algiers          Africa              44903225
##  4   213 ASM   American Samoa      Pago Pago        Oceania                44273
##  5   203 AND   Andorra             Andorra la Vella Europe                 79824
##  6    42 AGO   Angola              Luanda           Africa              35588987
##  7   224 AIA   Anguilla            The Valley       North Amer…            15857
##  8   201 ATG   Antigua and Barbuda Saint John’s     North Amer…            93763
##  9    33 ARG   Argentina           Buenos Aires     South Amer…         45510318
## 10   140 ARM   Armenia             Yerevan          Asia                 2780469
## # ℹ 224 more rows
## # ℹ 11 more variables: x2020_population <dbl>, x2015_population <dbl>,
## #   x2010_population <dbl>, x2000_population <dbl>, x1990_population <dbl>,
## #   x1980_population <dbl>, x1970_population <dbl>, area_km2 <dbl>,
## #   density_per_km2 <dbl>, growth_rate <dbl>, world_population_percentage <dbl>