R Markdown
Assignment 1
Importing data from different sources
Load data from CSV
This section loads the energy consumption dataset from a local CSV
file.
The dataset contains appliance energy usage and environmental sensor
readings
recorded every 10 minutes from a house in Belgium (January–May
2016).
library(readr)
## Warning: package 'readr' was built under R version 4.5.3
# Read the appliance-energy CSV from its absolute location on the author's
# machine. No column types are supplied, so readr infers them.
data <- readr::read_csv(
  file = "E:/Document/BIG Data Masters/Course Semester 1/Semester 1/R Programming for Data Science/energydata_complete.csv"
)
## Rows: 19735 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (28): Appliances, lights, T1, RH_1, T2, RH_2, T3, RH_3, T4, RH_4, T5, R...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(data)
## # A tibble: 6 × 29
## date Appliances lights T1 RH_1 T2 RH_2 T3 RH_3
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2016-01-11 17:00:00 60 30 19.9 47.6 19.2 44.8 19.8 44.7
## 2 2016-01-11 17:10:00 60 30 19.9 46.7 19.2 44.7 19.8 44.8
## 3 2016-01-11 17:20:00 50 30 19.9 46.3 19.2 44.6 19.8 44.9
## 4 2016-01-11 17:30:00 50 40 19.9 46.1 19.2 44.6 19.8 45
## 5 2016-01-11 17:40:00 60 40 19.9 46.3 19.2 44.5 19.8 45
## 6 2016-01-11 17:50:00 50 40 19.9 46.0 19.2 44.5 19.8 44.9
## # ℹ 20 more variables: T4 <dbl>, RH_4 <dbl>, T5 <dbl>, RH_5 <dbl>, T6 <dbl>,
## # RH_6 <dbl>, T7 <dbl>, RH_7 <dbl>, T8 <dbl>, RH_8 <dbl>, T9 <dbl>,
## # RH_9 <dbl>, T_out <dbl>, Press_mm_hg <dbl>, RH_out <dbl>, Windspeed <dbl>,
## # Visibility <dbl>, Tdewpoint <dbl>, rv1 <dbl>, rv2 <dbl>
tail(data)
## # A tibble: 6 × 29
## date Appliances lights T1 RH_1 T2 RH_2 T3 RH_3
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2016-05-27 17:10:00 90 0 25.5 46.9 26.0 42.5 27.3 41.1
## 2 2016-05-27 17:20:00 100 0 25.6 46.6 25.9 42.0 27.2 41.2
## 3 2016-05-27 17:30:00 90 0 25.5 46.5 25.8 42.1 27.1 41.2
## 4 2016-05-27 17:40:00 270 10 25.5 46.6 25.6 42.8 27.0 41.7
## 5 2016-05-27 17:50:00 420 10 25.5 47.0 25.4 43.0 26.9 41.3
## 6 2016-05-27 18:00:00 430 10 25.5 46.6 25.3 43.0 26.8 41.2
## # ℹ 20 more variables: T4 <dbl>, RH_4 <dbl>, T5 <dbl>, RH_5 <dbl>, T6 <dbl>,
## # RH_6 <dbl>, T7 <dbl>, RH_7 <dbl>, T8 <dbl>, RH_8 <dbl>, T9 <dbl>,
## # RH_9 <dbl>, T_out <dbl>, Press_mm_hg <dbl>, RH_out <dbl>, Windspeed <dbl>,
## # Visibility <dbl>, Tdewpoint <dbl>, rv1 <dbl>, rv2 <dbl>
summary(data)
## date Appliances lights
## Min. :2016-01-11 17:00:00 Min. : 10.00 Min. : 0.000
## 1st Qu.:2016-02-14 23:15:00 1st Qu.: 50.00 1st Qu.: 0.000
## Median :2016-03-20 05:30:00 Median : 60.00 Median : 0.000
## Mean :2016-03-20 05:30:00 Mean : 97.69 Mean : 3.802
## 3rd Qu.:2016-04-23 11:45:00 3rd Qu.: 100.00 3rd Qu.: 0.000
## Max. :2016-05-27 18:00:00 Max. :1080.00 Max. :70.000
## T1 RH_1 T2 RH_2
## Min. :16.79 Min. :27.02 Min. :16.10 Min. :20.46
## 1st Qu.:20.76 1st Qu.:37.33 1st Qu.:18.79 1st Qu.:37.90
## Median :21.60 Median :39.66 Median :20.00 Median :40.50
## Mean :21.69 Mean :40.26 Mean :20.34 Mean :40.42
## 3rd Qu.:22.60 3rd Qu.:43.07 3rd Qu.:21.50 3rd Qu.:43.26
## Max. :26.26 Max. :63.36 Max. :29.86 Max. :56.03
## T3 RH_3 T4 RH_4
## Min. :17.20 Min. :28.77 Min. :15.10 Min. :27.66
## 1st Qu.:20.79 1st Qu.:36.90 1st Qu.:19.53 1st Qu.:35.53
## Median :22.10 Median :38.53 Median :20.67 Median :38.40
## Mean :22.27 Mean :39.24 Mean :20.86 Mean :39.03
## 3rd Qu.:23.29 3rd Qu.:41.76 3rd Qu.:22.10 3rd Qu.:42.16
## Max. :29.24 Max. :50.16 Max. :26.20 Max. :51.09
## T5 RH_5 T6 RH_6
## Min. :15.33 Min. :29.82 Min. :-6.065 Min. : 1.00
## 1st Qu.:18.28 1st Qu.:45.40 1st Qu.: 3.627 1st Qu.:30.02
## Median :19.39 Median :49.09 Median : 7.300 Median :55.29
## Mean :19.59 Mean :50.95 Mean : 7.911 Mean :54.61
## 3rd Qu.:20.62 3rd Qu.:53.66 3rd Qu.:11.256 3rd Qu.:83.23
## Max. :25.80 Max. :96.32 Max. :28.290 Max. :99.90
## T7 RH_7 T8 RH_8
## Min. :15.39 Min. :23.20 Min. :16.31 Min. :29.60
## 1st Qu.:18.70 1st Qu.:31.50 1st Qu.:20.79 1st Qu.:39.07
## Median :20.03 Median :34.86 Median :22.10 Median :42.38
## Mean :20.27 Mean :35.39 Mean :22.03 Mean :42.94
## 3rd Qu.:21.60 3rd Qu.:39.00 3rd Qu.:23.39 3rd Qu.:46.54
## Max. :26.00 Max. :51.40 Max. :27.23 Max. :58.78
## T9 RH_9 T_out Press_mm_hg
## Min. :14.89 Min. :29.17 Min. :-5.000 Min. :729.3
## 1st Qu.:18.00 1st Qu.:38.50 1st Qu.: 3.667 1st Qu.:750.9
## Median :19.39 Median :40.90 Median : 6.917 Median :756.1
## Mean :19.49 Mean :41.55 Mean : 7.412 Mean :755.5
## 3rd Qu.:20.60 3rd Qu.:44.34 3rd Qu.:10.408 3rd Qu.:760.9
## Max. :24.50 Max. :53.33 Max. :26.100 Max. :772.3
## RH_out Windspeed Visibility Tdewpoint
## Min. : 24.00 Min. : 0.000 Min. : 1.00 Min. :-6.600
## 1st Qu.: 70.33 1st Qu.: 2.000 1st Qu.:29.00 1st Qu.: 0.900
## Median : 83.67 Median : 3.667 Median :40.00 Median : 3.433
## Mean : 79.75 Mean : 4.040 Mean :38.33 Mean : 3.761
## 3rd Qu.: 91.67 3rd Qu.: 5.500 3rd Qu.:40.00 3rd Qu.: 6.567
## Max. :100.00 Max. :14.000 Max. :66.00 Max. :15.500
## rv1 rv2
## Min. : 0.005322 Min. : 0.005322
## 1st Qu.:12.497889 1st Qu.:12.497889
## Median :24.897653 Median :24.897653
## Mean :24.988033 Mean :24.988033
## 3rd Qu.:37.583769 3rd Qu.:37.583769
## Max. :49.996530 Max. :49.996530
names(data)
## [1] "date" "Appliances" "lights" "T1" "RH_1"
## [6] "T2" "RH_2" "T3" "RH_3" "T4"
## [11] "RH_4" "T5" "RH_5" "T6" "RH_6"
## [16] "T7" "RH_7" "T8" "RH_8" "T9"
## [21] "RH_9" "T_out" "Press_mm_hg" "RH_out" "Windspeed"
## [26] "Visibility" "Tdewpoint" "rv1" "rv2"
colSums(is.na(data))
## date Appliances lights T1 RH_1 T2
## 0 0 0 0 0 0
## RH_2 T3 RH_3 T4 RH_4 T5
## 0 0 0 0 0 0
## RH_5 T6 RH_6 T7 RH_7 T8
## 0 0 0 0 0 0
## RH_8 T9 RH_9 T_out Press_mm_hg RH_out
## 0 0 0 0 0 0
## Windspeed Visibility Tdewpoint rv1 rv2
## 0 0 0 0 0
mean(data$Appliances)
## [1] 97.69496
sd(data$Appliances)
## [1] 102.5249
plot(data$Appliances)

hist(data$Appliances)

Load data from a PostgreSQL database
This section loads the salary prediction dataset from a
PostgreSQL database.
library(DBI)
## Warning: package 'DBI' was built under R version 4.5.3
library(RPostgres)
## Warning: package 'RPostgres' was built under R version 4.5.3
# Connect to the local PostgreSQL instance. The password is read from the
# PGPASSWORD environment variable instead of being hard-coded in the document,
# so the secret is never stored in (or knitted into) the report. The user name
# falls back to "admin" when PGUSER is not set.
pg_password <- Sys.getenv("PGPASSWORD")
if (!nzchar(pg_password)) {
  stop(
    "Set the PGPASSWORD environment variable before connecting.",
    call. = FALSE
  )
}
con <- dbConnect(
  RPostgres::Postgres(),
  dbname = "mydb",
  host = "localhost",
  port = 5432,
  user = Sys.getenv("PGUSER", unset = "admin"),
  password = pg_password
)
# Fetch every row of salary_prediction. This replaces the `data` object that
# was created by the CSV import earlier in the document.
data <- dbGetQuery(con, "SELECT * FROM salary_prediction")
summary(data$salary)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 31867 119358 143453 145718 169492 333046
head(data)
## job_title experience_years education_level skills_count
## 1 AI Engineer 10 Bachelor 2
## 2 Data Analyst 5 Bachelor 17
## 3 Frontend Developer 18 PhD 4
## 4 Business Analyst 19 PhD 13
## 5 Product Manager 15 Bachelor 7
## 6 AI Engineer 0 High School 4
## industry company_size location remote_work certifications salary
## 1 Healthcare Medium India Hybrid 2 109413
## 2 Telecom Small Australia No 0 93764
## 3 Media Medium Singapore No 1 148123
## 4 Retail Medium Canada Yes 0 189123
## 5 Manufacturing Large Sweden Yes 0 165069
## 6 Education Enterprise USA No 2 180351
tail(data)
## job_title experience_years education_level skills_count
## 249995 Business Analyst 13 Diploma 17
## 249996 Software Engineer 17 PhD 2
## 249997 Frontend Developer 20 PhD 7
## 249998 Business Analyst 1 Bachelor 12
## 249999 Data Scientist 0 High School 2
## 250000 Data Analyst 16 Diploma 2
## industry company_size location remote_work certifications salary
## 249995 Finance Large USA Yes 3 171896
## 249996 Telecom Enterprise India No 1 127791
## 249997 Telecom Startup Remote No 2 154593
## 249998 Retail Enterprise India Yes 0 75988
## 249999 Consulting Small Sweden Hybrid 5 90467
## 250000 Technology Medium UK No 5 133084
dbDisconnect(con)
Load data from a MySQL database
Assignment 2
This section demonstrates how to merge 2 to 3 datasets using common
key columns.
We load three global datasets — world population, CO2 emissions, and
GDP —
then join them by country to enable cross-dataset analysis.
load data
#library(tidyverse)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'tibble' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## Warning: package 'stringr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ purrr 1.2.2
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## Warning: package 'janitor' was built under R version 4.5.3
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(visdat)
## Warning: package 'visdat' was built under R version 4.5.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.5.3
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.5.3
library(GGally)
## Warning: package 'GGally' was built under R version 4.5.3
library(dplyr)
library(purrr)
# Import the three country-level datasets (population, CO2 emissions, GDP).
# show_col_types = FALSE silences readr's column-specification message.
population <- readr::read_csv(
  file = "E:/Document/BIG Data Masters/Course Semester 1/Semester 1/R Programming for Data Science/world_population.csv",
  show_col_types = FALSE
)
cos2 <- readr::read_csv(
  file = "E:/Document/BIG Data Masters/Course Semester 1/Semester 1/R Programming for Data Science/CO2_emission.csv",
  show_col_types = FALSE
)
## New names:
## • `2019` -> `2019...34`
## • `2019` -> `2019...35`
gdp_Countries <- readr::read_csv(
  file = "E:/Document/BIG Data Masters/Course Semester 1/Semester 1/R Programming for Data Science/gdp_2020-2025.csv",
  show_col_types = FALSE
)
How to use the clean data sets
library(janitor)
# Normalise the column names of each dataset with janitor::clean_names()
# (e.g. the population table ends up with names such as `country_territory`
# and `x2022_population`), so later joins can refer to predictable names.
population <- clean_names(population)
cos2 <- clean_names(cos2)
gdp_Countries <- clean_names(gdp_Countries)
This section demonstrates how to merge 2 to 3 datasets using common
key columns.
using inner join function
# Keep only the countries found in both tables, matching the population
# table's `country_territory` against the CO2 table's `country_name`.
merged_population_cos2 <- inner_join(
  population,
  cos2,
  by = c("country_territory" = "country_name")
)
print(merged_population_cos2)
## # A tibble: 186 × 51
## rank cca3 country_territory capital continent x2022_population
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 36 AFG Afghanistan Kabul Asia 41128771
## 2 138 ALB Albania Tirana Europe 2842321
## 3 34 DZA Algeria Algiers Africa 44903225
## 4 213 ASM American Samoa Pago Pago Oceania 44273
## 5 203 AND Andorra Andorra la Vella Europe 79824
## 6 42 AGO Angola Luanda Africa 35588987
## 7 201 ATG Antigua and Barbuda Saint John’s North Amer… 93763
## 8 33 ARG Argentina Buenos Aires South Amer… 45510318
## 9 140 ARM Armenia Yerevan Asia 2780469
## 10 198 ABW Aruba Oranjestad North Amer… 106445
## # ℹ 176 more rows
## # ℹ 45 more variables: x2020_population <dbl>, x2015_population <dbl>,
## # x2010_population <dbl>, x2000_population <dbl>, x1990_population <dbl>,
## # x1980_population <dbl>, x1970_population <dbl>, area_km2 <dbl>,
## # density_per_km2 <dbl>, growth_rate <dbl>,
## # world_population_percentage <dbl>, country_code <chr>, region <chr>,
## # indicator_name <chr>, x1990 <dbl>, x1991 <dbl>, x1992 <dbl>, x1993 <dbl>, …
# Three-way inner join: population x CO2 (on country name), then the result
# x GDP (on `country`). Only countries present in all three tables remain.
merged_all_dataset <- inner_join(
  inner_join(
    population,
    cos2,
    by = c("country_territory" = "country_name")
  ),
  gdp_Countries,
  by = c("country_territory" = "country")
)
print(merged_all_dataset)
## # A tibble: 168 × 57
## rank cca3 country_territory capital continent x2022_population
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 36 AFG Afghanistan Kabul Asia 41128771
## 2 138 ALB Albania Tirana Europe 2842321
## 3 34 DZA Algeria Algiers Africa 44903225
## 4 203 AND Andorra Andorra la Vella Europe 79824
## 5 42 AGO Angola Luanda Africa 35588987
## 6 201 ATG Antigua and Barbuda Saint John’s North Amer… 93763
## 7 33 ARG Argentina Buenos Aires South Amer… 45510318
## 8 140 ARM Armenia Yerevan Asia 2780469
## 9 198 ABW Aruba Oranjestad North Amer… 106445
## 10 55 AUS Australia Canberra Oceania 26177413
## # ℹ 158 more rows
## # ℹ 51 more variables: x2020_population <dbl>, x2015_population <dbl>,
## # x2010_population <dbl>, x2000_population <dbl>, x1990_population <dbl>,
## # x1980_population <dbl>, x1970_population <dbl>, area_km2 <dbl>,
## # density_per_km2 <dbl>, growth_rate <dbl>,
## # world_population_percentage <dbl>, country_code <chr>, region <chr>,
## # indicator_name <chr>, x1990 <dbl>, x1991 <dbl>, x1992 <dbl>, x1993 <dbl>, …
Assignment 3: using group_by() and the pipe operator %>%
This section groups the world population data by continent and
calculates
the average 2022 population per continent. This helps compare
population
distribution across different regions of the world.
# Mean 2022 population for each continent; NA values are ignored.
summarise(
  group_by(population, continent),
  avg_population = mean(x2022_population, na.rm = TRUE)
)
## # A tibble: 6 × 2
## continent avg_population
## <chr> <dbl>
## 1 Africa 25030367.
## 2 Asia 94427665.
## 3 Europe 14862951.
## 4 North America 15007403.
## 5 Oceania 1958198
## 6 South America 31201186.
# Number of rows (countries/territories) recorded for each continent.
summarise(
  group_by(population, continent),
  number_of_countries = n()
)
## # A tibble: 6 × 2
## continent number_of_countries
## <chr> <int>
## 1 Africa 57
## 2 Asia 50
## 3 Europe 50
## 4 North America 40
## 5 Oceania 23
## 6 South America 14
Select — the select() function keeps only the specified columns of the data
# Keep just the identifying columns plus density, growth, area and the 2022
# population.
select(
  population,
  country_territory, continent, density_per_km2,
  growth_rate, area_km2, x2022_population
)
## # A tibble: 234 × 6
## country_territory continent density_per_km2 growth_rate area_km2
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan Asia 63.1 1.03 652230
## 2 Albania Europe 98.9 0.996 28748
## 3 Algeria Africa 18.9 1.02 2381741
## 4 American Samoa Oceania 222. 0.983 199
## 5 Andorra Europe 171. 1.01 468
## 6 Angola Africa 28.5 1.03 1246700
## 7 Anguilla North America 174. 1.01 91
## 8 Antigua and Barbuda North America 212. 1.01 442
## 9 Argentina South America 16.4 1.01 2780400
## 10 Armenia Asia 93.5 0.996 29743
## # ℹ 224 more rows
## # ℹ 1 more variable: x2022_population <dbl>
Filter — filter() keeps only the rows that match a condition; here,
only the African countries
# Rows whose continent is exactly "Africa".
filter(population, continent == "Africa")
## # A tibble: 57 × 17
## rank cca3 country_territory capital continent x2022_population
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 34 DZA Algeria Algiers Africa 44903225
## 2 42 AGO Angola Luanda Africa 35588987
## 3 77 BEN Benin Porto-Novo Africa 13352864
## 4 144 BWA Botswana Gaborone Africa 2630296
## 5 58 BFA Burkina Faso Ouagadougou Africa 22673762
## 6 78 BDI Burundi Bujumbura Africa 12889576
## 7 53 CMR Cameroon Yaounde Africa 27914536
## 8 171 CPV Cape Verde Praia Africa 593149
## 9 117 CAF Central African Republic Bangui Africa 5579144
## 10 69 TCD Chad N'Djamena Africa 17723315
## # ℹ 47 more rows
## # ℹ 11 more variables: x2020_population <dbl>, x2015_population <dbl>,
## # x2010_population <dbl>, x2000_population <dbl>, x1990_population <dbl>,
## # x1980_population <dbl>, x1970_population <dbl>, area_km2 <dbl>,
## # density_per_km2 <dbl>, growth_rate <dbl>, world_population_percentage <dbl>
Arrange — sort by 2022 population descending
# Order the table from the most to the least populous country in 2022.
arrange(population, desc(x2022_population))
## # A tibble: 234 × 17
## rank cca3 country_territory capital continent x2022_population
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 1 CHN China Beijing Asia 1425887337
## 2 2 IND India New Delhi Asia 1417173173
## 3 3 USA United States Washington, D.C. North America 338289857
## 4 4 IDN Indonesia Jakarta Asia 275501339
## 5 5 PAK Pakistan Islamabad Asia 235824862
## 6 6 NGA Nigeria Abuja Africa 218541212
## 7 7 BRA Brazil Brasilia South America 215313498
## 8 8 BGD Bangladesh Dhaka Asia 171186372
## 9 9 RUS Russia Moscow Europe 144713314
## 10 10 MEX Mexico Mexico City North America 127504125
## # ℹ 224 more rows
## # ℹ 11 more variables: x2020_population <dbl>, x2015_population <dbl>,
## # x2010_population <dbl>, x2000_population <dbl>, x1990_population <dbl>,
## # x1980_population <dbl>, x1970_population <dbl>, area_km2 <dbl>,
## # density_per_km2 <dbl>, growth_rate <dbl>, world_population_percentage <dbl>
Rename — cleaner column names
pupulation_rename <- population
print(pupulation_rename)
## # A tibble: 234 × 17
## rank cca3 country_territory capital continent x2022_population
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 36 AFG Afghanistan Kabul Asia 41128771
## 2 138 ALB Albania Tirana Europe 2842321
## 3 34 DZA Algeria Algiers Africa 44903225
## 4 213 ASM American Samoa Pago Pago Oceania 44273
## 5 203 AND Andorra Andorra la Vella Europe 79824
## 6 42 AGO Angola Luanda Africa 35588987
## 7 224 AIA Anguilla The Valley North Amer… 15857
## 8 201 ATG Antigua and Barbuda Saint John’s North Amer… 93763
## 9 33 ARG Argentina Buenos Aires South Amer… 45510318
## 10 140 ARM Armenia Yerevan Asia 2780469
## # ℹ 224 more rows
## # ℹ 11 more variables: x2020_population <dbl>, x2015_population <dbl>,
## # x2010_population <dbl>, x2000_population <dbl>, x1990_population <dbl>,
## # x1980_population <dbl>, x1970_population <dbl>, area_km2 <dbl>,
## # density_per_km2 <dbl>, growth_rate <dbl>, world_population_percentage <dbl>
# Copy of `population` with shorter names for four columns
# (new_name = old_name); `population` itself is left unchanged.
pupulation_rename <- rename(
  population,
  country = country_territory,
  population_2022 = x2022_population,
  density = density_per_km2,
  area = area_km2
)
#print(population)
Mutate — add new columns
# Add three derived columns:
#   pop_growth_abs   - absolute population change from 2000 to 2022
#   pop_growth_pct   - the same change as a percentage of the 2000 population
#   density_category - "Low" (< 50), "Medium" (< 200) or "High" (everything
#                      else, which includes rows where density is NA)
mutate(
  population,
  pop_growth_abs = x2022_population - x2000_population,
  pop_growth_pct = (pop_growth_abs / x2000_population) * 100,
  density_category = case_when(
    density_per_km2 < 50 ~ "Low",
    density_per_km2 < 200 ~ "Medium",
    TRUE ~ "High"
  )
)
## # A tibble: 234 × 20
## rank cca3 country_territory capital continent x2022_population
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 36 AFG Afghanistan Kabul Asia 41128771
## 2 138 ALB Albania Tirana Europe 2842321
## 3 34 DZA Algeria Algiers Africa 44903225
## 4 213 ASM American Samoa Pago Pago Oceania 44273
## 5 203 AND Andorra Andorra la Vella Europe 79824
## 6 42 AGO Angola Luanda Africa 35588987
## 7 224 AIA Anguilla The Valley North Amer… 15857
## 8 201 ATG Antigua and Barbuda Saint John’s North Amer… 93763
## 9 33 ARG Argentina Buenos Aires South Amer… 45510318
## 10 140 ARM Armenia Yerevan Asia 2780469
## # ℹ 224 more rows
## # ℹ 14 more variables: x2020_population <dbl>, x2015_population <dbl>,
## # x2010_population <dbl>, x2000_population <dbl>, x1990_population <dbl>,
## # x1980_population <dbl>, x1970_population <dbl>, area_km2 <dbl>,
## # density_per_km2 <dbl>, growth_rate <dbl>,
## # world_population_percentage <dbl>, pop_growth_abs <dbl>,
## # pop_growth_pct <dbl>, density_category <chr>
Assignment 4: how to use trace() and recover()
# Solve the quadratic equation a*x^2 + b*x + c = 0 and print its real roots.
#
# Args:
#   a: numeric coefficient of x^2; must be non-zero.
#   b: numeric coefficient of x.
#   c: numeric constant term.
#
# Prints the two roots when the discriminant is positive, the single repeated
# root when it is zero, and a message when the roots are complex.
# Raises an error when `a` is zero.
quadratic_equation <- function(a, b, c) {
  # Guard: with a == 0 the root formula divides by zero and would silently
  # print NaN/Inf instead of a root.
  if (a == 0) {
    stop("`a` must be non-zero: the equation is not quadratic.", call. = FALSE)
  }
  delta <- (b * b) - (4 * a * c)
  if (delta > 0) {
    # Two distinct real roots; compute the square root once.
    root_term <- sqrt(delta)
    r1 <- (-b + root_term) / (2 * a)
    r2 <- (-b - root_term) / (2 * a)
    print(r1)
    print(r2)
  } else if (delta == 0) {
    # One repeated real root.
    r1 <- -b / (2 * a)
    print(paste("One real root:", r1))
  } else {
    print("Complex roots exist.")
  }
}
Trace()
# Attach a tracer that echoes the three coefficients each time
# quadratic_equation() is called. No `at` is given, so the tracer runs on
# entry to the function. print = FALSE suppresses trace()'s own descriptive
# line.
trace(
  what = "quadratic_equation",
  tracer = quote(print(paste("Inputs:", a, b, c))),
  print = FALSE
)
## [1] "quadratic_equation"
quadratic_equation(1, -5, 6)
## [1] "Inputs: 1 -5 6"
## [1] 3
## [1] 2
untrace("quadratic_equation")
Using recover()
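recover() is used when an error happens: with options(error = recover) set, R opens an interactive browser at the point of failure.
It lets you inspect the environment (call frames and variables) where the error occurred.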
# Print the real roots of a*x^2 + b*x + c = 0, rounded to two decimals.
#
# Args:
#   a, b, c: numeric coefficients of the quadratic.
#
# NOTE: the trace() call in the "Debugging Workflow" section uses `at = 3` and
# reads the local variable `delta`, so the order of the first statements and
# the name `delta` must stay exactly as they are.
# NOTE(review): a == 0 is not guarded; the divisions by (2 * a) would then
# yield Inf/NaN.
quadratic_equation <- function(a, b, c) {
# Discriminant of the quadratic.
delta <- (b * b) - (4 * a * c)
if (delta > 0) {
# Positive discriminant: two distinct real roots.
r1 <- (-b + sqrt(delta)) / (2 * a)
r2 <- (-b - sqrt(delta)) / (2 * a)
print(paste("Root 1:", round(r1, 2)))
print(paste("Root 2:", round(r2, 2)))
} else if (delta == 0) {
# Zero discriminant: one repeated real root.
r1 <- -b / (2 * a)
print(paste("One root:", round(r1, 2)))
} else {
# Negative discriminant: nothing to compute, only report it.
print("No real roots — delta is negative")
}
}
options(error = NULL)
quadratic_equation(1, -5, 6)
## [1] "Root 1: 3"
## [1] "Root 2: 2"
Debugging Workflow
# Restore the default error handler.
options(error = NULL)
# `at = 3` is an index into the steps of the function body, not a source line
# number: step 1 is the opening `{`, step 2 assigns `delta`, step 3 is the
# `if` chain. The tracer therefore runs just before the `if`, once `delta`
# has been assigned.
trace(
  what = "quadratic_equation",
  at = 3,
  tracer = quote(print(paste("Delta =", delta))),
  print = FALSE
)
## [1] "quadratic_equation"
quadratic_equation(1, -5, 6)
## [1] "Delta = 1"
## [1] "Root 1: 3"
## [1] "Root 2: 2"
untrace("quadratic_equation")
Assignment 5: how can we use the functions sapply(), vapply(),
lapply(), map(), mapply() and summary statistics
Make a function that produces summary statistics
print(population)
## # A tibble: 234 × 17
## rank cca3 country_territory capital continent x2022_population
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 36 AFG Afghanistan Kabul Asia 41128771
## 2 138 ALB Albania Tirana Europe 2842321
## 3 34 DZA Algeria Algiers Africa 44903225
## 4 213 ASM American Samoa Pago Pago Oceania 44273
## 5 203 AND Andorra Andorra la Vella Europe 79824
## 6 42 AGO Angola Luanda Africa 35588987
## 7 224 AIA Anguilla The Valley North Amer… 15857
## 8 201 ATG Antigua and Barbuda Saint John’s North Amer… 93763
## 9 33 ARG Argentina Buenos Aires South Amer… 45510318
## 10 140 ARM Armenia Yerevan Asia 2780469
## # ℹ 224 more rows
## # ℹ 11 more variables: x2020_population <dbl>, x2015_population <dbl>,
## # x2010_population <dbl>, x2000_population <dbl>, x1990_population <dbl>,
## # x1980_population <dbl>, x1970_population <dbl>, area_km2 <dbl>,
## # density_per_km2 <dbl>, growth_rate <dbl>, world_population_percentage <dbl>
# Print basic summary statistics for one numeric column of a data frame.
#
# Args:
#   data:   a data frame (or tibble).
#   column: a single column name (or position) identifying the column.
#
# Prints count, mean, median, standard deviation, min, max (all computed with
# NA values removed and rounded to 2 decimals) and the number of missing
# values. Count is the full length of the column, NA values included.
#
# Returns (invisibly) a list with the unrounded statistics, so callers can
# reuse the numbers instead of parsing the printed text.
#
# Raises an error when `column` is not a single value, names a column that
# does not exist, or refers to a non-numeric column. Previously these cases
# produced NA/Inf output accompanied by warnings.
summary_stats <- function(data, column) {
  if (length(column) != 1L) {
    stop("`column` must be a single column name or position.", call. = FALSE)
  }
  if (is.character(column) && !(column %in% names(data))) {
    stop("Column '", column, "' not found in `data`.", call. = FALSE)
  }
  values <- data[[column]]
  if (!is.numeric(values)) {
    stop("Column '", column, "' must be numeric.", call. = FALSE)
  }

  # A list (not an atomic vector) keeps the integer counts as integers, so
  # cat() prints them exactly as before.
  stats <- list(
    count = length(values),
    mean = mean(values, na.rm = TRUE),
    median = median(values, na.rm = TRUE),
    sd = sd(values, na.rm = TRUE),
    min = min(values, na.rm = TRUE),
    max = max(values, na.rm = TRUE),
    missing = sum(is.na(values))
  )

  cat("Summary Statistics for:", column, "\n")
  cat("Count :", stats$count, "\n")
  cat("Mean :", round(stats$mean, 2), "\n")
  cat("Median :", round(stats$median, 2), "\n")
  cat("Std Dev :", round(stats$sd, 2), "\n")
  cat("Min :", round(stats$min, 2), "\n")
  cat("Max :", round(stats$max, 2), "\n")
  cat("Missing :", stats$missing, "\n")

  invisible(stats)
}
# Run on population column
summary_stats(population, "x2022_population")
## Summary Statistics for: x2022_population
## Count : 234
## Mean : 34074415
## Median : 5559945
## Std Dev : 136766425
## Min : 510
## Max : 1425887337
## Missing : 0
summary_stats(population, "density_per_km2")
## Summary Statistics for: density_per_km2
## Count : 234
## Mean : 452.13
## Median : 95.35
## Std Dev : 2066.12
## Min : 0.03
## Max : 23172.27
## Missing : 0
summary_stats(population, "area_km2")
## Summary Statistics for: area_km2
## Count : 234
## Mean : 581449.4
## Median : 81199.5
## Std Dev : 1761841
## Min : 1
## Max : 17098242
## Missing : 0
summary_stats(population, "growth_rate")
## Summary Statistics for: growth_rate
## Count : 234
## Mean : 1.01
## Median : 1.01
## Std Dev : 0.01
## Min : 0.91
## Max : 1.07
## Missing : 0
using sapply
# Column means for the three numeric columns; sapply() simplifies the result
# to a named numeric vector. Extra arguments (na.rm) are passed on to mean().
sapply(
  X = population[c("density_per_km2", "growth_rate", "x2022_population")],
  FUN = mean,
  na.rm = TRUE
)
## density_per_km2 growth_rate x2022_population
## 4.521270e+02 1.009577e+00 3.407441e+07
using lapply
# Six-number summary of each of the three columns; lapply() always returns a
# list, with one element per column.
lapply(
  X = population[c("density_per_km2", "growth_rate", "x2022_population")],
  FUN = summary
)
## $density_per_km2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.610e-02 3.842e+01 9.535e+01 4.521e+02 2.389e+02 2.317e+04
##
## $growth_rate
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.912 1.002 1.008 1.010 1.017 1.069
##
## $x2022_population
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.100e+02 4.197e+05 5.560e+06 3.407e+07 2.248e+07 1.426e+09
using vapply
# Same column means as the sapply() example, but vapply() checks that every
# result is a single numeric value (FUN.VALUE = numeric(1)).
vapply(
  X = population[c("density_per_km2", "growth_rate", "x2022_population")],
  FUN = function(col) mean(col, na.rm = TRUE),
  FUN.VALUE = numeric(1)
)
## density_per_km2 growth_rate x2022_population
## 4.521270e+02 1.009577e+00 3.407441e+07
population_split by continent
# Break the table into a named list with one tibble per continent, then show
# the structure of that list.
population_split <- split(x = population, f = population$continent)
str(object = population_split)
## List of 6
## $ Africa : tibble [57 × 17] (S3: tbl_df/tbl/data.frame)
## ..$ rank : num [1:57] 34 42 77 144 58 78 53 171 117 69 ...
## ..$ cca3 : chr [1:57] "DZA" "AGO" "BEN" "BWA" ...
## ..$ country_territory : chr [1:57] "Algeria" "Angola" "Benin" "Botswana" ...
## ..$ capital : chr [1:57] "Algiers" "Luanda" "Porto-Novo" "Gaborone" ...
## ..$ continent : chr [1:57] "Africa" "Africa" "Africa" "Africa" ...
## ..$ x2022_population : num [1:57] 44903225 35588987 13352864 2630296 22673762 ...
## ..$ x2020_population : num [1:57] 43451666 33428485 12643123 2546402 21522626 ...
## ..$ x2015_population : num [1:57] 39543154 28127721 10932783 2305171 18718019 ...
## ..$ x2010_population : num [1:57] 35856344 23364185 9445710 2091664 16116845 ...
## ..$ x2000_population : num [1:57] 30774621 16394062 6998023 1726985 11882888 ...
## ..$ x1990_population : num [1:57] 25518074 11828638 5133419 1341474 9131361 ...
## ..$ x1980_population : num [1:57] 18739378 8330047 3833939 938578 6932967 ...
## ..$ x1970_population : num [1:57] 13795915 6029700 3023443 592244 5611666 ...
## ..$ area_km2 : num [1:57] 2381741 1246700 112622 582000 272967 ...
## ..$ density_per_km2 : num [1:57] 18.85 28.55 118.56 4.52 83.06 ...
## ..$ growth_rate : num [1:57] 1.02 1.03 1.03 1.02 1.03 ...
## ..$ world_population_percentage: num [1:57] 0.56 0.45 0.17 0.03 0.28 0.16 0.35 0.01 0.07 0.22 ...
## $ Asia : tibble [50 × 17] (S3: tbl_df/tbl/data.frame)
## ..$ rank : num [1:50] 36 140 91 154 8 165 175 73 1 131 ...
## ..$ cca3 : chr [1:50] "AFG" "ARM" "AZE" "BHR" ...
## ..$ country_territory : chr [1:50] "Afghanistan" "Armenia" "Azerbaijan" "Bahrain" ...
## ..$ capital : chr [1:50] "Kabul" "Yerevan" "Baku" "Manama" ...
## ..$ continent : chr [1:50] "Asia" "Asia" "Asia" "Asia" ...
## ..$ x2022_population : num [1:50] 4.11e+07 2.78e+06 1.04e+07 1.47e+06 1.71e+08 ...
## ..$ x2020_population : num [1:50] 3.90e+07 2.81e+06 1.03e+07 1.48e+06 1.67e+08 ...
## ..$ x2015_population : num [1:50] 3.38e+07 2.88e+06 9.86e+06 1.36e+06 1.58e+08 ...
## ..$ x2010_population : num [1:50] 2.82e+07 2.95e+06 9.24e+06 1.21e+06 1.48e+08 ...
## ..$ x2000_population : num [1:50] 1.95e+07 3.17e+06 8.19e+06 7.11e+05 1.29e+08 ...
## ..$ x1990_population : num [1:50] 1.07e+07 3.56e+06 7.43e+06 5.17e+05 1.07e+08 ...
## ..$ x1980_population : num [1:50] 12486631 3135123 6383060 362595 83929765 ...
## ..$ x1970_population : num [1:50] 10752971 2534377 5425317 222555 67541860 ...
## ..$ area_km2 : num [1:50] 652230 29743 86600 765 147570 ...
## ..$ density_per_km2 : num [1:50] 63.1 93.5 119.6 1924.5 1160 ...
## ..$ growth_rate : num [1:50] 1.026 0.996 1.004 1.006 1.011 ...
## ..$ world_population_percentage: num [1:50] 0.52 0.03 0.13 0.02 2.15 ...
## $ Europe : tibble [50 × 17] (S3: tbl_df/tbl/data.frame)
## ..$ rank : num [1:50] 138 203 99 96 81 137 108 130 158 88 ...
## ..$ cca3 : chr [1:50] "ALB" "AND" "AUT" "BLR" ...
## ..$ country_territory : chr [1:50] "Albania" "Andorra" "Austria" "Belarus" ...
## ..$ capital : chr [1:50] "Tirana" "Andorra la Vella" "Vienna" "Minsk" ...
## ..$ continent : chr [1:50] "Europe" "Europe" "Europe" "Europe" ...
## ..$ x2022_population : num [1:50] 2842321 79824 8939617 9534954 11655930 ...
## ..$ x2020_population : num [1:50] 2866849 77700 8907777 9633740 11561717 ...
## ..$ x2015_population : num [1:50] 2882481 71746 8642421 9700609 11248303 ...
## ..$ x2010_population : num [1:50] 2913399 71519 8362829 9731427 10877947 ...
## ..$ x2000_population : num [1:50] 3182021 66097 8010428 10256483 10264343 ...
## ..$ x1990_population : num [1:50] 3295066 53569 7678729 10428525 9959560 ...
## ..$ x1980_population : num [1:50] 2941651 35611 7547561 9817257 9828986 ...
## ..$ x1970_population : num [1:50] 2324731 19860 7465301 9170786 9629376 ...
## ..$ area_km2 : num [1:50] 28748 468 83871 207600 30528 ...
## ..$ density_per_km2 : num [1:50] 98.9 170.6 106.6 45.9 381.8 ...
## ..$ growth_rate : num [1:50] 0.996 1.01 1.002 0.996 1.004 ...
## ..$ world_population_percentage: num [1:50] 0.04 0 0.11 0.12 0.15 0.04 0.09 0.05 0.02 0.13 ...
## $ North America: tibble [40 × 17] (S3: tbl_df/tbl/data.frame)
## ..$ rank : num [1:40] 224 201 198 176 186 177 206 221 39 205 ...
## ..$ cca3 : chr [1:40] "AIA" "ATG" "ABW" "BHS" ...
## ..$ country_territory : chr [1:40] "Anguilla" "Antigua and Barbuda" "Aruba" "Bahamas" ...
## ..$ capital : chr [1:40] "The Valley" "Saint John’s" "Oranjestad" "Nassau" ...
## ..$ continent : chr [1:40] "North America" "North America" "North America" "North America" ...
## ..$ x2022_population : num [1:40] 15857 93763 106445 409984 281635 ...
## ..$ x2020_population : num [1:40] 15585 92664 106585 406471 280693 ...
## ..$ x2015_population : num [1:40] 14525 89941 104257 392697 278083 ...
## ..$ x2010_population : num [1:40] 13172 85695 100341 373272 274711 ...
## ..$ x2000_population : num [1:40] 11047 75055 89101 325014 264657 ...
## ..$ x1990_population : num [1:40] 8316 63328 65712 270679 258868 ...
## ..$ x1980_population : num [1:40] 6560 64888 62267 223752 253575 ...
## ..$ x1970_population : num [1:40] 6283 64516 59106 179129 241397 ...
## ..$ area_km2 : num [1:40] 91 442 180 13943 430 ...
## ..$ density_per_km2 : num [1:40] 174.3 212.1 591.4 29.4 655 ...
## ..$ growth_rate : num [1:40] 1.007 1.006 0.999 1.005 1.002 ...
## ..$ world_population_percentage: num [1:40] 0 0 0 0.01 0 0.01 0 0 0.48 0 ...
## $ Oceania : tibble [23 × 17] (S3: tbl_df/tbl/data.frame)
## ..$ rank : num [1:23] 213 55 223 162 183 191 192 215 194 225 ...
## ..$ cca3 : chr [1:23] "ASM" "AUS" "COK" "FJI" ...
## ..$ country_territory : chr [1:23] "American Samoa" "Australia" "Cook Islands" "Fiji" ...
## ..$ capital : chr [1:23] "Pago Pago" "Canberra" "Avarua" "Suva" ...
## ..$ continent : chr [1:23] "Oceania" "Oceania" "Oceania" "Oceania" ...
## ..$ x2022_population : num [1:23] 44273 26177413 17011 929766 306279 ...
## ..$ x2020_population : num [1:23] 46189 25670051 17029 920422 301920 ...
## ..$ x2015_population : num [1:23] 51368 23820236 17695 917200 291787 ...
## ..$ x2010_population : num [1:23] 54849 22019168 17212 905169 283788 ...
## ..$ x2000_population : num [1:23] 58230 19017963 15897 832509 250927 ...
## ..$ x1990_population : num [1:23] 47818 17048003 17123 780430 211089 ...
## ..$ x1980_population : num [1:23] 32886 14706322 17651 644582 163591 ...
## ..$ x1970_population : num [1:23] 27075 12595034 20470 527634 117891 ...
## ..$ area_km2 : num [1:23] 199 7692024 236 18272 4167 ...
## ..$ density_per_km2 : num [1:23] 222.5 3.4 72.1 50.9 73.5 ...
## ..$ growth_rate : num [1:23] 0.983 1.01 1 1.006 1.007 ...
## ..$ world_population_percentage: num [1:23] 0 0.33 0 0.01 0 0 0 0 0 0 ...
## $ South America: tibble [14 × 17] (S3: tbl_df/tbl/data.frame)
## ..$ rank : num [1:14] 33 80 7 65 28 67 231 184 164 109 ...
## ..$ cca3 : chr [1:14] "ARG" "BOL" "BRA" "CHL" ...
## ..$ country_territory : chr [1:14] "Argentina" "Bolivia" "Brazil" "Chile" ...
## ..$ capital : chr [1:14] "Buenos Aires" "Sucre" "Brasilia" "Santiago" ...
## ..$ continent : chr [1:14] "South America" "South America" "South America" "South America" ...
## ..$ x2022_population : num [1:14] 4.55e+07 1.22e+07 2.15e+08 1.96e+07 5.19e+07 ...
## ..$ x2020_population : num [1:14] 4.50e+07 1.19e+07 2.13e+08 1.93e+07 5.09e+07 ...
## ..$ x2015_population : num [1:14] 4.33e+07 1.11e+07 2.05e+08 1.79e+07 4.71e+07 ...
## ..$ x2010_population : num [1:14] 4.11e+07 1.02e+07 1.96e+08 1.70e+07 4.48e+07 ...
## ..$ x2000_population : num [1:14] 3.71e+07 8.59e+06 1.76e+08 1.54e+07 3.92e+07 ...
## ..$ x1990_population : num [1:14] 3.26e+07 7.10e+06 1.51e+08 1.33e+07 3.26e+07 ...
## ..$ x1980_population : num [1:14] 2.80e+07 5.74e+06 1.22e+08 1.15e+07 2.62e+07 ...
## ..$ x1970_population : num [1:14] 23842803 4585693 96369875 9820481 20905254 ...
## ..$ area_km2 : num [1:14] 2780400 1098581 8515767 756102 1141748 ...
## ..$ density_per_km2 : num [1:14] 16.4 11.1 25.3 25.9 45.4 ...
## ..$ growth_rate : num [1:14] 1.01 1.01 1 1.01 1.01 ...
## ..$ world_population_percentage: num [1:14] 0.57 0.15 2.7 0.25 0.65 0.23 0 0 0.01 0.09 ...
# Mean population density per continent: tapply() groups the density values
# by continent and applies mean() (NA values removed) to each group.
tapply(
  X = population$density_per_km2,
  INDEX = population$continent,
  FUN = mean,
  na.rm = TRUE
)
## Africa Asia Europe North America Oceania
## 125.04765 1025.02414 663.32474 272.76176 132.54307
## South America
## 20.97198
using map
library(purrr) # add this if not already loaded
# purrr equivalent of the lapply() example: map() returns a list holding the
# mean (NA values removed) of each selected column.
map(
  population[c("density_per_km2", "growth_rate", "x2022_population")],
  function(col) mean(col, na.rm = TRUE)
)
## $density_per_km2
## [1] 452.127
##
## $growth_rate
## [1] 1.009577
##
## $x2022_population
## [1] 34074415
print(population)
## # A tibble: 234 × 17
## rank cca3 country_territory capital continent x2022_population
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 36 AFG Afghanistan Kabul Asia 41128771
## 2 138 ALB Albania Tirana Europe 2842321
## 3 34 DZA Algeria Algiers Africa 44903225
## 4 213 ASM American Samoa Pago Pago Oceania 44273
## 5 203 AND Andorra Andorra la Vella Europe 79824
## 6 42 AGO Angola Luanda Africa 35588987
## 7 224 AIA Anguilla The Valley North Amer… 15857
## 8 201 ATG Antigua and Barbuda Saint John’s North Amer… 93763
## 9 33 ARG Argentina Buenos Aires South Amer… 45510318
## 10 140 ARM Armenia Yerevan Asia 2780469
## # ℹ 224 more rows
## # ℹ 11 more variables: x2020_population <dbl>, x2015_population <dbl>,
## # x2010_population <dbl>, x2000_population <dbl>, x1990_population <dbl>,
## # x1980_population <dbl>, x1970_population <dbl>, area_km2 <dbl>,
## # density_per_km2 <dbl>, growth_rate <dbl>, world_population_percentage <dbl>