library(readxl)
my_data <- read_excel("D:/AUCA/R for data science/archive/E-Commerce Orders.csv.xlsx")
head(my_data)
## # A tibble: 6 × 14
## OrderID Date CustomerID Product Quantity UnitPrice
## <chr> <dttm> <chr> <chr> <dbl> <dbl>
## 1 ORD200000 2023-01-04 00:00:00 C72649 Monitor 5 571.
## 2 ORD200001 2024-08-23 00:00:00 C75739 Phone 2 151.
## 3 ORD200002 2024-02-27 00:00:00 C81728 Tablet 5 551.
## 4 ORD200003 2023-10-15 00:00:00 C33540 Chair 1 273.
## 5 ORD200004 2025-05-08 00:00:00 C81840 Printer 4 626.
## 6 ORD200005 2023-10-23 00:00:00 C37249 Phone 2 246.
## # ℹ 8 more variables: ShippingAddress <chr>, PaymentMethod <chr>,
## # OrderStatus <chr>, TrackingNumber <chr>, ItemsInCart <dbl>,
## # CouponCode <chr>, ReferralSource <chr>, TotalPrice <dbl>
library(DBI)
library(RPostgres)
con <- DBI::dbConnect(
RPostgres::Postgres(),
dbname = "hw1",
host = "localhost",
port = 5432,
user = "postgres",
password = "ian"
)
con
## <PqConnection> hw1@localhost:5432
df <- DBI::dbGetQuery(con, "SELECT * FROM hw1 LIMIT 10")
df
## orderid date customerid product quantity unitprice shippingaddress
## 1 ORD200000 2023-01-04 C72649 Monitor 5 570.62 928 Main St
## 2 ORD200001 2024-08-23 C75739 Phone 2 151.35 823 Main St
## 3 ORD200002 2024-02-27 C81728 Tablet 5 550.68 512 Main St
## 4 ORD200003 2023-10-15 C33540 Chair 1 273.19 275 Main St
## 5 ORD200004 2025-05-08 C81840 Printer 4 626.01 668 Main St
## 6 ORD200005 2023-10-23 C37249 Phone 2 245.86 934 Main St
## 7 ORD200006 2025-06-17 C83492 Laptop 1 664.42 986 Main St
## 8 ORD200007 2023-05-12 C41460 Monitor 5 149.55 706 Main St
## 9 ORD200008 2025-04-02 C26817 Phone 2 134.28 904 Main St
## 10 ORD200009 2023-11-21 C31946 Desk 4 509.38 102 Main St
## paymentmethod orderstatus trackingnumber itemsincart couponcode
## 1 Debit Card Shipped TRK37947903 7 SAVE10
## 2 Online Shipped TRK91186779 3 SAVE10
## 3 Credit Card Cancelled TRK42903982 8 FREESHIP
## 4 Debit Card Returned TRK62788070 5 SAVE10
## 5 Online Delivered TRK29241424 8 SAVE10
## 6 Credit Card Shipped TRK72976927 4 SAVE10
## 7 Gift Card Returned TRK96417362 6 SAVE10
## 8 Cash Shipped TRK78809193 9 FREESHIP
## 9 Gift Card Cancelled TRK61042692 2 <NA>
## 10 Credit Card Shipped TRK33478363 6 SAVE10
## referralsource totalprice
## 1 Instagram 2853.10
## 2 Referral 302.70
## 3 Email 2753.40
## 4 Facebook 273.19
## 5 Email 2504.04
## 6 Instagram 491.72
## 7 Facebook 664.42
## 8 Facebook 747.75
## 9 Email 268.56
## 10 Google 2037.52
In this analysis, merging was used to combine two separate datasets—World Population data and CO2 emissions data—into a single unified dataset based on a common key, the country name. This was done using a join operation (merge()), matching Country.Territory from the population dataset with Country.Name from the CO2 dataset. The purpose of merging was to bring related variables together, specifically the 2022 population figures and 2019 CO2 emissions, so they could be analyzed in relation to each other. After merging, the dataset contained only countries present in both datasets, which allowed direct comparison and statistical analysis between population size and CO2 emission levels. This step is essential in data analysis because it integrates multiple sources into one structure, enabling correlation analysis, visualization, and regression between variables that originally existed in separate tables.
setwd("D:\\AUCA\\R for data science")
getwd()
## [1] "D:/AUCA/R for data science"
population <- read.csv("world_population.csv")
co2 <- read.csv("CO2_emission.csv")
head(population)
## Rank CCA3 Country.Territory Capital Continent X2022.Population
## 1 36 AFG Afghanistan Kabul Asia 41128771
## 2 138 ALB Albania Tirana Europe 2842321
## 3 34 DZA Algeria Algiers Africa 44903225
## 4 213 ASM American Samoa Pago Pago Oceania 44273
## 5 203 AND Andorra Andorra la Vella Europe 79824
## 6 42 AGO Angola Luanda Africa 35588987
## X2020.Population X2015.Population X2010.Population X2000.Population
## 1 38972230 33753499 28189672 19542982
## 2 2866849 2882481 2913399 3182021
## 3 43451666 39543154 35856344 30774621
## 4 46189 51368 54849 58230
## 5 77700 71746 71519 66097
## 6 33428485 28127721 23364185 16394062
## X1990.Population X1980.Population X1970.Population Area..km..
## 1 10694796 12486631 10752971 652230
## 2 3295066 2941651 2324731 28748
## 3 25518074 18739378 13795915 2381741
## 4 47818 32886 27075 199
## 5 53569 35611 19860 468
## 6 11828638 8330047 6029700 1246700
## Density..per.km.. Growth.Rate World.Population.Percentage
## 1 63.0587 1.0257 0.52
## 2 98.8702 0.9957 0.04
## 3 18.8531 1.0164 0.56
## 4 222.4774 0.9831 0.00
## 5 170.5641 1.0100 0.00
## 6 28.5466 1.0315 0.45
head(co2)
## Country.Name country_code Region
## 1 Aruba ABW Latin America & Caribbean
## 2 Afghanistan AFG South Asia
## 3 Angola AGO Sub-Saharan Africa
## 4 Albania ALB Europe & Central Asia
## 5 Andorra AND Europe & Central Asia
## 6 United Arab Emirates ARE Middle East & North Africa
## Indicator.Name X1990 X1991 X1992
## 1 CO2 emissions (metric tons per capita) NA NA NA
## 2 CO2 emissions (metric tons per capita) 0.1917451 0.1676816 0.09595774
## 3 CO2 emissions (metric tons per capita) 0.5536620 0.5445386 0.54355722
## 4 CO2 emissions (metric tons per capita) 1.8195416 1.2428102 0.68369983
## 5 CO2 emissions (metric tons per capita) 7.5218317 7.2353792 6.96307870
## 6 CO2 emissions (metric tons per capita) 30.1951886 31.7784962 29.08092584
## X1993 X1994 X1995 X1996 X1997 X1998
## 1 NA NA NA NA NA NA
## 2 0.08472111 0.07554583 0.06846796 0.06258803 0.05682662 0.05269086
## 3 0.70898423 0.83680440 0.91214149 1.07216847 1.08663697 1.09182531
## 4 0.63830704 0.64535519 0.60543625 0.61236736 0.46692147 0.57215370
## 5 6.72417752 6.54157891 6.73347949 6.99159455 7.30744115 7.63953851
## 6 29.27567777 30.84933296 31.12501806 30.92802588 30.48633262 29.66358052
## X1999 X2000 X2001 X2002 X2003 X2004
## 1 NA NA NA NA NA NA
## 2 0.04015697 0.0365737 0.03378536 0.04557366 0.05151838 0.04165539
## 3 1.10985966 0.9880774 0.94182891 0.89557767 0.92486944 0.93026295
## 4 0.95535931 1.0262131 1.05549588 1.23237878 1.33898498 1.40405869
## 5 7.92319165 7.9522863 7.72154906 7.56623988 7.24241557 7.34426233
## 6 28.88710798 27.0351591 29.43026994 28.50146173 27.96926982 27.03893822
## X2005 X2006 X2007 X2008 X2009 X2010
## 1 NA NA NA NA NA NA
## 2 0.06041878 0.06658329 0.06531235 0.1284166 0.1718624 0.2436140
## 3 0.81353929 0.82184008 0.81175351 0.8886580 0.9394040 0.9761842
## 4 1.33820940 1.33999574 1.39393137 1.3843112 1.4414936 1.5276237
## 5 7.35378001 6.79054277 6.53104692 6.4393039 6.1566875 6.1571978
## 6 25.38238104 22.93510429 21.37028576 22.0114692 19.8323489 19.0397698
## X2011 X2012 X2013 X2014 X2015 X2016 X2017
## 1 NA NA NA NA NA NA NA
## 2 0.2965062 0.2592953 0.1856237 0.1462356 0.1728967 0.1497893 0.1316946
## 3 0.9855223 0.9506959 1.0362939 1.0997791 1.1350441 1.0318113 0.8133007
## 4 1.6694232 1.5032405 1.5336300 1.6683374 1.6037751 1.5576644 1.7887861
## 5 5.8508861 5.9446542 5.9428004 5.8071277 6.0261818 6.0806003 6.1041339
## 6 18.5094574 19.2078011 20.0556476 20.0516980 21.0776420 21.4806686 20.7690223
## X2018 X2019 X2019.1
## 1 NA NA NA
## 2 0.1632953 0.1598244 0.1598244
## 3 0.7776749 0.7921371 0.7921371
## 4 1.7827389 1.6922483 1.6922483
## 5 6.3629754 6.4812174 6.4812174
## 6 18.3906781 19.3295633 19.3295633
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
pop_2022 <- population %>%
select(Country.Territory, X2022.Population)
co2_2019 <- co2 %>%
select(Country.Name, X2019)
co2_2019$X2019 <- as.numeric(co2_2019$X2019)
pop_2022$X2022.Population <- as.numeric(pop_2022$X2022.Population)
merged_data <- merge(
pop_2022,
co2_2019,
by.x = "Country.Territory",
by.y = "Country.Name"
)
dim(merged_data)
## [1] 186 3
head(merged_data)
## Country.Territory X2022.Population X2019
## 1 Afghanistan 41128771 0.1598244
## 2 Albania 2842321 1.6922483
## 3 Algeria 44903225 3.9776505
## 4 American Samoa 44273 NA
## 5 Andorra 79824 6.4812174
## 6 Angola 35588987 0.7921371
options(scipen = 999)
In this analysis, we used the tidyverse pipeline operator %>% to streamline data processing and improve readability by linking multiple transformation steps. The group_by() function was used to partition the dataset into meaningful groups based on categorical variables such as country, allowing calculations to be performed within each group independently. After grouping, the summarise() function was applied to compute aggregated statistics, such as total or average CO2 emissions per country. The pipeline approach enabled a clear workflow where data is first grouped, then summarized, and finally arranged or visualized. This method helped transform raw data into structured summaries suitable for analysis and interpretation.
wrangle_data <- function(data) {
data %>%
# 1. select important columns
select(Country.Territory, X2022.Population, X2019) %>%
# 2. filter (remove very small countries for clarity)
filter(X2022.Population > 1000000) %>%
# 3. mutate (create new variable)
mutate(
co2_per_million = X2019 / (X2022.Population / 1e6)
) %>%
# 4. rename columns
rename(
Country = Country.Territory,
Population_2022 = X2022.Population,
CO2_2019 = X2019
) %>%
# 5. arrange (sort by CO2)
arrange(desc(CO2_2019))
}
clean_data <- wrangle_data(merged_data)
head(clean_data)
## Country Population_2022 CO2_2019 co2_per_million
## 1 Qatar 2695122 32.47447 12.0493502
## 2 Kuwait 4268873 22.02242 5.1588362
## 3 Bahrain 1472233 20.26610 13.7655540
## 4 United Arab Emirates 9441129 19.32956 2.0473784
## 5 Canada 38454327 15.43061 0.4012712
## 6 Saudi Arabia 36408820 15.28458 0.4198043
merged_data %>%
group_by(Country.Territory) %>%
summarise(
population = mean(X2022.Population, na.rm = TRUE),
co2 = mean(X2019, na.rm = TRUE)
)
## # A tibble: 186 × 3
## Country.Territory population co2
## <chr> <dbl> <dbl>
## 1 Afghanistan 41128771 0.160
## 2 Albania 2842321 1.69
## 3 Algeria 44903225 3.98
## 4 American Samoa 44273 NaN
## 5 Andorra 79824 6.48
## 6 Angola 35588987 0.792
## 7 Antigua and Barbuda 93763 5.35
## 8 Argentina 45510318 3.74
## 9 Armenia 2780469 2.09
## 10 Aruba 106445 NaN
## # ℹ 176 more rows
country_summary <- merged_data %>%
group_by(Country.Territory) %>%
summarise(
population = mean(X2022.Population, na.rm = TRUE),
co2 = mean(X2019, na.rm = TRUE)
) %>%
arrange(desc(co2))
country_summary %>%
top_n(10, co2)
## # A tibble: 10 × 3
## Country.Territory population co2
## <chr> <dbl> <dbl>
## 1 Qatar 2695122 32.5
## 2 Kuwait 4268873 22.0
## 3 Bahrain 1472233 20.3
## 4 United Arab Emirates 9441129 19.3
## 5 Canada 38454327 15.4
## 6 Luxembourg 647599 15.3
## 7 Saudi Arabia 36408820 15.3
## 8 Oman 4576298 15.3
## 9 Australia 26177413 15.2
## 10 United States 338289857 14.7
country_summary <- merged_data %>%
group_by(Country.Territory) %>%
summarise(
population = mean(X2022.Population, na.rm = TRUE),
co2 = mean(X2019, na.rm = TRUE)
) %>%
arrange(desc(co2))
colnames(merged_data)
## [1] "Country.Territory" "X2022.Population" "X2019"
merged_data %>%
group_by(Country.Territory) %>%
summarise(total_co2 = sum(X2019, na.rm = TRUE)) %>%
arrange(desc(total_co2)) %>%
head(10) %>%
ggplot(aes(x = reorder(Country.Territory, total_co2), y = total_co2)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_minimal()
merged_data %>%
ggplot(aes(x = X2022.Population, y = X2019)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
theme_minimal() +
labs(title = "Population vs CO2 Emissions",
x = "Population (2022)",
y = "CO2 Emissions (2019)")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 18 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).
merged_data %>%
ggplot(aes(x = X2022.Population, y = X2019)) +
geom_point() +
scale_x_log10() +
scale_y_log10() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 18 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).
trace() was used to inspect and monitor the execution of functions by inserting a custom message or action when the function runs. This helps reveal the internal flow of a function and confirms when and how it is being executed during analysis. It is mainly used for debugging and understanding function behavior.
recover() was used to enable interactive debugging when an error occurs in R. When an error happens, it pauses execution and allows inspection of the call stack, letting the user enter the function where the error occurred to examine variables and identify the source of the problem. This improves error diagnosis and helps locate issues during code execution.
co2_per_capita <- function(co2, population) {
result <- co2 / population
return(result)
}
trace("co2_per_capita", tracer = quote(cat("Function is running\n")), print = FALSE)
## [1] "co2_per_capita"
co2_per_capita(1000, 50000)
## Function is running
## [1] 0.02
safe_co2_ratio <- function(co2, population) {
result <- co2 / population
if (population == 0) stop("Population cannot be zero")
return(result)
}
options(error = recover)
trace("safe_co2_ratio",
tracer = quote(cat("Function executing...\n")),
print = FALSE)
## [1] "safe_co2_ratio"
safe_co2_ratio(1000, 50000)
## Function executing...
## [1] 0.02
A custom summary statistics function was created to automate the calculation of important descriptive measures for numerical variables. The function computes the mean, median, standard deviation, minimum value, and maximum value while handling missing values appropriately. This approach improves code reusability and allows quick statistical exploration of variables such as population and CO2 emissions. By applying the function to different dataset variables, it becomes easier to compare distributions and understand the overall characteristics of the data.
A custom two-sample t-test function was developed to statistically compare the mean values of two independent groups. In this analysis, countries were divided into high-population and low-population categories, and the function was used to test if there was a significant difference in CO2 emissions between the two groups. The function internally applies Welch’s t-test, which is suitable when group variances may not be equal. This method provides statistical evidence about group differences using the t-statistic, p-value, confidence interval, and group means.
summary_stats <- function(x) {
result <- c(
mean = mean(x, na.rm = TRUE),
median = median(x, na.rm = TRUE),
sd = sd(x, na.rm = TRUE),
min = min(x, na.rm = TRUE),
max = max(x, na.rm = TRUE)
)
return(result)
}
summary_stats(population$X2022.Population)
## mean median sd min max
## 34074415 5559945 136766425 510 1425887337
summary_stats(merged_data$X2019)
## mean median sd min max
## 4.11549190 2.89605304 4.78699274 0.04468071 32.47446876
A Welch two-sample t-test was conducted to compare CO2 emissions between high-population and low-population countries. The results showed no statistically significant difference in mean CO2 emissions between the two groups (p = 0.0985 > 0.05). This suggests that population size alone may not be a strong predictor of CO2 emissions, as other economic and industrial factors also influence emission levels.
merged_data$pop_group <- ifelse(
merged_data$X2022.Population > median(merged_data$X2022.Population, na.rm = TRUE),
"High",
"Low"
)
two_sample_test <- function(data, group_var, value_var) {
group1 <- data[[value_var]][data[[group_var]] == "High"]
group2 <- data[[value_var]][data[[group_var]] == "Low"]
test <- t.test(group1, group2)
return(test)
}
two_sample_test(merged_data, "pop_group", "X2019")
##
## Welch Two Sample t-test
##
## data: group1 and group2
## t = -1.6637, df = 133.66, p-value = 0.09851
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.7738120 0.2392745
## sample estimates:
## mean of x mean of y
## 3.549747 4.817016
#lapply, sapply and yapply
The apply family functions in R were used to automate repetitive statistical operations across multiple variables. lapply() was used to apply functions to dataset columns and return results as lists, while sapply() simplified the outputs into vectors for easier interpretation. The apply() function was used for row-wise and column-wise operations on numerical data. These functions improved efficiency and reduced repetitive coding during statistical analysis of population and CO2 emission variables.
summary_sapply <- function(data) {
numeric_data <- data %>% select(where(is.numeric))
sapply(numeric_data, mean, na.rm = TRUE)
}
summary_sapply(merged_data)
## X2022.Population X2019
## 38621321.661290 4.115492
summary_vapply <- function(data) {
numeric_data <- data %>% select(where(is.numeric))
vapply(numeric_data, mean, FUN.VALUE = numeric(1), na.rm = TRUE)
}
summary_vapply(merged_data)
## X2022.Population X2019
## 38621321.661290 4.115492
compare_values <- function(x, y) {
mapply(function(a, b) a - b, x, y)
}
mapply(
function(pop, co2) pop / co2,
merged_data$X2022.Population,
merged_data$X2019
)
## [1] 257337291.461 1679612.245 11288881.580 NA 12316.205
## [6] 44927814.128 17511.142 12166418.793 1332880.213 NA
## [11] 1717873.348 1225615.067 2924029.006 72645.097 307596247.534
## [16] 64668.127 1557429.549 1439788.663 247184.897 21586186.327
## [21] NA 568654.041 6299795.180 506590.556 835782.177
## [26] 104632308.237 NA 1208719.570 92152440.318 206422559.983
## [31] 17085520.776 76598866.638 2492080.349 NA 110308490.422
## [36] 125614050.640 4066221.110 187470306.987 32222475.866 2225010.882
## [41] 3173610.575 991793.250 4904407.103 NA 208623.222
## [46] 1163053.830 1151580.703 2598120.055 30724.108 4428418.211
## [51] 7959867.456 5242592.513 424513.110 14640330.694 172838.108
## [56] 1437163.556 753173440.988 NA 507637.977 751506.056
## [61] 14461837.757 NA 988623.136 1377815.580 10537643.655
## [66] 50811587.499 NA 1855721.969 NA 42573.656
## [71] NA 15343382.738 44810392.765 12256416.238 222122.988
## [76] 39302026.438 9949104.759 2099869.979 81983.892 788360765.448
## [81] 120285400.827 10020240.558 693307.067 NA 1306190.705
## [86] 11115414.749 993550.684 14512583.922 4628763.008 1693122.393
## [91] 127488295.933 171488.138 193842.171 467875.366 1346549.268
## [96] 6364277.800 22187559.717 812857.774 9968.083 655075.083
## [101] 42308.961 193836747.520 262155528.105 4281276.826 131804.923
## [106] 76182730.194 161933.261 13577.128 5426413.810 394424.983
## [111] 36206304.879 984475.122 NA 475088.627 150024.068
## [116] 19112455.075 133487169.327 79742466.155 1517410.749 2272.639
## [121] 64975986.876 2081765.870 NA 759146.936 8679527.230
## [126] 284151982.803 380975276.167 523827.034 NA 808405.347
## [131] 299448.196 267993304.588 1300.032 1403356.408 11743117.705
## [136] 5818257.417 19506041.173 85915622.714 5127216.538 2366685.037
## [141] NA 82992.027 5149863.637 130795117.598 146099.779
## [146] NA 325984.082 2382062.381 26572045.609 1091492.407
## [151] 17143.270 74709175.283 719348.961 325518.404 1347592.351
## [156] 393850299.156 7977622.584 71013329.893 9341063.059 20017081.577
## [161] 97324748.968 135587.010 3098158.450 2005136.238 9848211.871
## [166] 305158805.443 18690132.308 2797510.760 30176525.516 69789.629
## [171] 124244.261 4831205.004 524389.164 NA 13184.136
## [176] 356948732.525 10085328.734 488429.504 12931471.822 23054616.680
## [181] 1825699.925 9963230.941 466587.846 28147378.474 52578877.010
## [186] 20324998.233
library(purrr)
summary_map <- function(data) {
numeric_data <- data %>% select(where(is.numeric))
map(numeric_data, ~mean(.x, na.rm = TRUE))
}
summary_map(merged_data)
## $X2022.Population
## [1] 38621322
##
## $X2019
## [1] 4.115492
map_dbl(merged_data %>% select(where(is.numeric)), mean, na.rm = TRUE)
## X2022.Population X2019
## 38621321.661290 4.115492
map_df(merged_data %>% select(where(is.numeric)), ~data.frame(mean = mean(.x, na.rm = TRUE)))
## mean
## 1 38621321.661290
## 2 4.115492
analysis_pipeline <- function(data) {
library(dplyr)
library(purrr)
library(ggplot2)
numeric_data <- data %>% select(where(is.numeric))
basic_summary <- sapply(numeric_data, mean, na.rm = TRUE)
safe_summary <- vapply(
numeric_data,
mean,
FUN.VALUE = numeric(1),
na.rm = TRUE
)
variance_summary <- map(numeric_data, ~var(.x, na.rm = TRUE))
relationship <- cor(
data$X2022.Population,
data$X2019,
use = "complete.obs"
)
plot <- ggplot(data, aes(x = X2022.Population, y = X2019)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
theme_minimal()
list(
mean_sapply = basic_summary,
mean_vapply = safe_summary,
variance_map = variance_summary,
relationship_mapply = relationship,
plot = plot
)
}
result <- analysis_pipeline(merged_data)
result$mean_sapply
## X2022.Population X2019
## 38621321.661290 4.115492
result$variance_map
## $X2022.Population
## [1] 23172560636485492
##
## $X2019
## [1] 22.9153
result$relationship_mapply
## [1] 0.008170163
result$plot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 18 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).
visualization_suite <- function(data) {
library(ggplot2)
# Base plot data
p_base <- ggplot(data, aes(x = X2022.Population, y = X2019))
# 1. Scatter plot (geom_point)
p_point <- p_base + geom_point() + ggtitle("Geom Point")
# 2. Smooth line
p_smooth <- p_base + geom_smooth(method = "lm") + ggtitle("Geom Smooth")
# 3. Jitter
p_jitter <- p_base + geom_jitter() + ggtitle("Geom Jitter")
# 4. Line plot (sorted)
p_line <- data %>%
arrange(X2022.Population) %>%
ggplot(aes(X2022.Population, X2019)) +
geom_line() +
ggtitle("Geom Line")
# 5. Histogram (population)
p_hist <- ggplot(data, aes(X2022.Population)) +
geom_histogram(bins = 30) +
ggtitle("Geom Histogram")
# 6. Density plot
p_density <- ggplot(data, aes(X2022.Population)) +
geom_density() +
ggtitle("Geom Density")
# 7. Boxplot (CO2 distribution)
p_box <- ggplot(data, aes(y = X2019)) +
geom_boxplot() +
ggtitle("Geom Boxplot")
# 8. Rug plot
p_rug <- p_base + geom_point() + geom_rug() + ggtitle("Geom Rug")
# 9. Horizontal line (mean CO2)
p_hline <- ggplot(data, aes(X2022.Population, X2019)) +
geom_point() +
geom_hline(yintercept = mean(data$X2019, na.rm = TRUE), color = "red") +
ggtitle("Geom Hline")
# 10. Vertical line (mean population)
p_vline <- ggplot(data, aes(X2022.Population, X2019)) +
geom_point() +
geom_vline(xintercept = mean(data$X2022.Population, na.rm = TRUE), color = "blue") +
ggtitle("Geom Vline")
# 11. Text (top country label)
top <- data[which.max(data$X2019), ]
p_text <- p_base +
geom_point() +
geom_text(
aes(label = Country.Territory),
data = top,
vjust = -1
) +
ggtitle("Geom Text")
# 12. Violin (CO2 distribution not meaningful by group here, so simplified)
p_violin <- ggplot(data, aes(x = "", y = X2019)) +
geom_violin() +
ggtitle("Geom Violin")
# Return all plots
list(
point = p_point,
smooth = p_smooth,
jitter = p_jitter,
line = p_line,
hist = p_hist,
density = p_density,
box = p_box,
rug = p_rug,
hline = p_hline,
vline = p_vline,
text = p_text,
violin = p_violin
)
}
plots <- visualization_suite(merged_data)
plots$point
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).
for (p in plots) print(p)
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 18 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_rug()`).
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18 rows containing non-finite outside the scale range
## (`stat_ydensity()`).