install.packages("haven")
## Create a sample dataset and view the data set
library(haven)
# Build a ten-row example data frame with an ID, a name, an age and a salary
# for each person.
datasample <- data.frame(
  ID     = seq_len(10),
  Name   = c(
    "Jean", "Alice", "Bob", "David", "Emma",
    "Frank", "Grace", "Helen", "Ian", "Jane"
  ),
  Age    = c(25, 30, 28, 35, 22, 40, 27, 33, 29, 31),
  Salary = c(500, 600, 550, 700, 450, 800, 520, 680, 590, 610)
)
# Display the whole data frame.
print(datasample)
## ID Name Age Salary
## 1 1 Jean 25 500
## 2 2 Alice 30 600
## 3 3 Bob 28 550
## 4 4 David 35 700
## 5 5 Emma 22 450
## 6 6 Frank 40 800
## 7 7 Grace 27 520
## 8 8 Helen 33 680
## 9 9 Ian 29 590
## 10 10 Jane 31 610
# Export the data frame to an SPSS .sav file in the current working directory.
haven::write_sav(data = datasample, path = "datasample.sav")
# List the working directory to confirm that the file was written.
list.files(path = ".")
## [1] "datasample.sav" "loan_datasample.dta"
## [3] "R-Assignment.html" "R-Assignment.pdf"
## [5] "R-Assignment.Rmd" "R assignment I for Import Data.Rmd"
## [7] "R Assignment.Rmd" "rsconnect"
"datasample.sav"
## [1] "datasample.sav"
# Read the .sav file back in and preview its first rows.
data_loaded <- haven::read_sav(file = "datasample.sav")
head(x = data_loaded)
## # A tibble: 6 × 4
## ID Name Age Salary
## <dbl> <chr> <dbl> <dbl>
## 1 1 Jean 25 500
## 2 2 Alice 30 600
## 3 3 Bob 28 550
## 4 4 David 35 700
## 5 5 Emma 22 450
## 6 6 Frank 40 800
# Repeat the SPSS round trip with an explicit target location.
# NOTE(review): the path is absolute and specific to one Windows user account,
# so these calls only work on that machine — consider a path relative to the
# project directory.
haven::write_sav(
  data = datasample,
  path = "C:/Users/jeandedieu.rwabukang/Documents/R Assignment for Import/datasample.sav"
)
spss_data <- haven::read_sav(
  file = "C:/Users/jeandedieu.rwabukang/Documents/R Assignment for Import/datasample.sav"
)
head(x = spss_data)
## # A tibble: 6 × 4
## ID Name Age Salary
## <dbl> <chr> <dbl> <dbl>
## 1 1 Jean 25 500
## 2 2 Alice 30 600
## 3 3 Bob 28 550
## 4 4 David 35 700
## 5 5 Emma 22 450
## 6 6 Frank 40 800
install.packages("DBI")
install.packages("odbc")
install.packages("RMySQL")
install.packages("RPostgres")
install.packages("tinytex")
tinytex::install_tinytex()
library(readxl)
# Import the Excel workbook into a tibble.
datainexcel <- readxl::read_excel(
  path = "C:/Users/jeandedieu.rwabukang/Documents/R Assignment for Import/OpeningDate.xlsx"
)
## New names:
## • `` -> `...5`
## • `` -> `...7`
# The message above reports that columns 5 and 7 have empty headers in the
# workbook and were given the placeholder names ...5 and ...7.
head(x = datainexcel)
## # A tibble: 6 × 12
## `Open Date` `Date Birth` Gender Marital ...5 Status ...7
## <dttm> <dttm> <chr> <chr> <lgl> <chr> <lgl>
## 1 2026-01-12 00:00:00 1999-09-09 00:00:00 Male Single NA Active NA
## 2 2026-02-03 00:00:00 1990-01-01 00:00:00 Male Married NA Active NA
## 3 2026-02-03 00:00:00 1987-12-22 00:00:00 Female Married NA Active NA
## 4 2026-02-03 00:00:00 1990-12-28 00:00:00 Male Single NA Active NA
## 5 2026-02-03 00:00:00 1987-06-03 00:00:00 Female Married NA Active NA
## 6 2026-02-03 00:00:00 2000-01-01 00:00:00 Female Married NA Active NA
## # ℹ 5 more variables: Education <chr>, Post <chr>, Nature <chr>, Salary <dbl>,
## # `Valid Date` <dttm>
# Load the built-in iris data set and preview its first six rows.
data(list = "iris")
head(iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Mean petal length over all flowers.
with(iris, mean(Petal.Length))
## [1] 3.758
# Mean petal length within each species, one species at a time.
with(iris, mean(Petal.Length[Species == "setosa"]))
## [1] 1.462
with(iris, mean(Petal.Length[Species == "versicolor"]))
## [1] 4.26
with(iris, mean(Petal.Length[Species == "virginica"]))
## [1] 5.552
# Exploratory Data Analysis
# Load the world_population and CO2_emission CSV data sets (downloaded from the Kaggle website)
# Read both CSV files into data frames.
# NOTE(review): the paths are absolute and specific to one Windows user
# account — consider paths relative to the project directory.
population_data <- read.csv(
  file = "C:/Users/jeandedieu.rwabukang/Documents/R Assignment for Import/Assignment/world_population.csv"
)
co2_data <- read.csv(
  file = "C:/Users/jeandedieu.rwabukang/Documents/R Assignment for Import/Assignment/CO2_emission.csv"
)
# Preview the first rows of the population data.
head(x = population_data)
## Rank CCA3 Country.Territory Capital Continent X2022.Population
## 1 36 AFG Afghanistan Kabul Asia 41128771
## 2 138 ALB Albania Tirana Europe 2842321
## 3 34 DZA Algeria Algiers Africa 44903225
## 4 213 ASM American Samoa Pago Pago Oceania 44273
## 5 203 AND Andorra Andorra la Vella Europe 79824
## 6 42 AGO Angola Luanda Africa 35588987
## X2020.Population X2015.Population X2010.Population X2000.Population
## 1 38972230 33753499 28189672 19542982
## 2 2866849 2882481 2913399 3182021
## 3 43451666 39543154 35856344 30774621
## 4 46189 51368 54849 58230
## 5 77700 71746 71519 66097
## 6 33428485 28127721 23364185 16394062
## X1990.Population X1980.Population X1970.Population Area..km..
## 1 10694796 12486631 10752971 652230
## 2 3295066 2941651 2324731 28748
## 3 25518074 18739378 13795915 2381741
## 4 47818 32886 27075 199
## 5 53569 35611 19860 468
## 6 11828638 8330047 6029700 1246700
## Density..per.km.. Growth.Rate World.Population.Percentage
## 1 63.0587 1.0257 0.52
## 2 98.8702 0.9957 0.04
## 3 18.8531 1.0164 0.56
## 4 222.4774 0.9831 0.00
## 5 170.5641 1.0100 0.00
## 6 28.5466 1.0315 0.45
# Preview the CO2 data. According to the Indicator.Name column printed below,
# the yearly columns (X1990 ... X2019) hold CO2 emissions in metric tons per
# capita. The file also yields an X2019.1 column that shows the same values as
# X2019 in these rows.
head(co2_data)
## Country.Name country_code Region
## 1 Aruba ABW Latin America & Caribbean
## 2 Afghanistan AFG South Asia
## 3 Angola AGO Sub-Saharan Africa
## 4 Albania ALB Europe & Central Asia
## 5 Andorra AND Europe & Central Asia
## 6 United Arab Emirates ARE Middle East & North Africa
## Indicator.Name X1990 X1991 X1992
## 1 CO2 emissions (metric tons per capita) NA NA NA
## 2 CO2 emissions (metric tons per capita) 0.1917451 0.1676816 0.09595774
## 3 CO2 emissions (metric tons per capita) 0.5536620 0.5445386 0.54355722
## 4 CO2 emissions (metric tons per capita) 1.8195416 1.2428102 0.68369983
## 5 CO2 emissions (metric tons per capita) 7.5218317 7.2353792 6.96307870
## 6 CO2 emissions (metric tons per capita) 30.1951886 31.7784962 29.08092584
## X1993 X1994 X1995 X1996 X1997 X1998
## 1 NA NA NA NA NA NA
## 2 0.08472111 0.07554583 0.06846796 0.06258803 0.05682662 0.05269086
## 3 0.70898423 0.83680440 0.91214149 1.07216847 1.08663697 1.09182531
## 4 0.63830704 0.64535519 0.60543625 0.61236736 0.46692147 0.57215370
## 5 6.72417752 6.54157891 6.73347949 6.99159455 7.30744115 7.63953851
## 6 29.27567777 30.84933296 31.12501806 30.92802588 30.48633262 29.66358052
## X1999 X2000 X2001 X2002 X2003 X2004
## 1 NA NA NA NA NA NA
## 2 0.04015697 0.0365737 0.03378536 0.04557366 0.05151838 0.04165539
## 3 1.10985966 0.9880774 0.94182891 0.89557767 0.92486944 0.93026295
## 4 0.95535931 1.0262131 1.05549588 1.23237878 1.33898498 1.40405869
## 5 7.92319165 7.9522863 7.72154906 7.56623988 7.24241557 7.34426233
## 6 28.88710798 27.0351591 29.43026994 28.50146173 27.96926982 27.03893822
## X2005 X2006 X2007 X2008 X2009 X2010
## 1 NA NA NA NA NA NA
## 2 0.06041878 0.06658329 0.06531235 0.1284166 0.1718624 0.2436140
## 3 0.81353929 0.82184008 0.81175351 0.8886580 0.9394040 0.9761842
## 4 1.33820940 1.33999574 1.39393137 1.3843112 1.4414936 1.5276237
## 5 7.35378001 6.79054277 6.53104692 6.4393039 6.1566875 6.1571978
## 6 25.38238104 22.93510429 21.37028576 22.0114692 19.8323489 19.0397698
## X2011 X2012 X2013 X2014 X2015 X2016 X2017
## 1 NA NA NA NA NA NA NA
## 2 0.2965062 0.2592953 0.1856237 0.1462356 0.1728967 0.1497893 0.1316946
## 3 0.9855223 0.9506959 1.0362939 1.0997791 1.1350441 1.0318113 0.8133007
## 4 1.6694232 1.5032405 1.5336300 1.6683374 1.6037751 1.5576644 1.7887861
## 5 5.8508861 5.9446542 5.9428004 5.8071277 6.0261818 6.0806003 6.1041339
## 6 18.5094574 19.2078011 20.0556476 20.0516980 21.0776420 21.4806686 20.7690223
## X2018 X2019 X2019.1
## 1 NA NA NA
## 2 0.1632953 0.1598244 0.1598244
## 3 0.7776749 0.7921371 0.7921371
## 4 1.7827389 1.6922483 1.6922483
## 5 6.3629754 6.4812174 6.4812174
## 6 18.3906781 19.3295633 19.3295633
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Combine each country's continent and 2022 population with its 2019 CO2
# value (renamed to co2_2019). inner_join() keeps only countries that appear
# in both data sets.
# NOTE(review): the join key is the country name, so a country spelled
# differently in the two files would be dropped. Both data sets also carry a
# three-letter code (CCA3 / country_code) that may be a more reliable key —
# confirm.
merged_data2 <- inner_join(
  x = select(population_data, Country.Territory, Continent, X2022.Population),
  y = select(co2_data, Country.Name, co2_2019 = `X2019`),
  by = c("Country.Territory" = "Country.Name")
)
head(x = merged_data2)
## Country.Territory Continent X2022.Population co2_2019
## 1 Afghanistan Asia 41128771 0.1598244
## 2 Albania Europe 2842321 1.6922483
## 3 Algeria Africa 44903225 3.9776505
## 4 American Samoa Oceania 44273 NA
## 5 Andorra Europe 79824 6.4812174
## 6 Angola Africa 35588987 0.7921371
# Sum the 2019 CO2 values by continent using group_by() and %>%, then sort
# the continents from the largest to the smallest sum.
# NOTE(review): co2_2019 comes from the X2019 column, whose indicator is
# "CO2 emissions (metric tons per capita)", so Total_CO2 adds up per-capita
# figures rather than total emissions — confirm this is the intended measure.
continent_co2 <- merged_data2 %>%
  group_by(Continent) %>%
  summarise(Total_CO2 = sum(co2_2019, na.rm = TRUE)) %>%
  arrange(desc(Total_CO2))
print(continent_co2)
## # A tibble: 6 × 2
## Continent Total_CO2
## <chr> <dbl>
## 1 Asia 242.
## 2 Europe 229.
## 3 North America 79.6
## 4 Africa 58.0
## 5 Oceania 53.2
## 6 South America 29.4
## ggplot2 is the most popular data visualization package in R; it is built on Leland Wilkinson's Grammar of Graphics
install.packages("ggplot2")
# geom_bar() counts rows by default; to plot values that are already computed,
# use stat = "identity" (or the equivalent geom_col()).
library(ggplot2)
# Keep the ten countries with the largest 2022 population, largest first.
top10_Population <- population_data %>%
  arrange(desc(X2022.Population)) %>%
  slice_head(n = 10)
top10_Population
## Rank CCA3 Country.Territory Capital Continent X2022.Population
## 1 1 CHN China Beijing Asia 1425887337
## 2 2 IND India New Delhi Asia 1417173173
## 3 3 USA United States Washington, D.C. North America 338289857
## 4 4 IDN Indonesia Jakarta Asia 275501339
## 5 5 PAK Pakistan Islamabad Asia 235824862
## 6 6 NGA Nigeria Abuja Africa 218541212
## 7 7 BRA Brazil Brasilia South America 215313498
## 8 8 BGD Bangladesh Dhaka Asia 171186372
## 9 9 RUS Russia Moscow Europe 144713314
## 10 10 MEX Mexico Mexico City North America 127504125
## X2020.Population X2015.Population X2010.Population X2000.Population
## 1 1424929781 1393715448 1348191368 1264099069
## 2 1396387127 1322866505 1240613620 1059633675
## 3 335942003 324607776 311182845 282398554
## 4 271857970 259091970 244016173 214072421
## 5 227196741 210969298 194454498 154369924
## 6 208327405 183995785 160952853 122851984
## 7 213196304 205188205 196353492 175873720
## 8 167420951 157830000 148391139 129193327
## 9 145617329 144668389 143242599 146844839
## 10 125998302 120149897 112532401 97873442
## X1990.Population X1980.Population X1970.Population Area..km..
## 1 1153704252 982372466 822534450 9706961
## 2 870452165 696828385 557501301 3287590
## 3 248083732 223140018 200328340 9372610
## 4 182159874 148177096 115228394 1904569
## 5 115414069 80624057 59290872 881912
## 6 95214257 72951439 55569264 923768
## 7 150706446 122288383 96369875 8515767
## 8 107147651 83929765 67541860 147570
## 9 148005704 138257420 130093010 17098242
## 10 81720428 67705186 50289306 1964375
## Density..per.km.. Growth.Rate World.Population.Percentage
## 1 146.8933 1.0000 17.88
## 2 431.0675 1.0068 17.77
## 3 36.0935 1.0038 4.24
## 4 144.6529 1.0064 3.45
## 5 267.4018 1.0191 2.96
## 6 236.5759 1.0241 2.74
## 7 25.2841 1.0046 2.70
## 8 1160.0350 1.0108 2.15
## 9 8.4636 0.9973 1.81
## 10 64.9082 1.0063 1.60
# Horizontal bar chart of the ten most populous countries. reorder() sorts the
# bars by population and coord_flip() turns them sideways so the country names
# stay readable. geom_col() plots the population values as given.
top10_Population %>%
  ggplot(
    mapping = aes(
      x = reorder(Country.Territory, X2022.Population),
      y = X2022.Population
    )
  ) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Top 10 Most Populous Countries in 2022",
    x = "Country",
    y = "Population"
  ) +
  theme_minimal()
# Using geom_smooth() with a different colour: it adds a smoothed trend line
# to a plot and is almost always paired with geom_point(). In the graph below,
# each point is a country plotted by its population and its area.
# Scatter plot of area against 2022 population for the ten countries, with a
# straight-line (method = "lm") trend drawn without its confidence band.
top10_Population %>%
  ggplot(mapping = aes(x = X2022.Population, y = Area..km..)) +
  geom_point(colour = "blue", size = 3) +
  geom_smooth(method = "lm", colour = "Orange", se = FALSE) +
  labs(
    title = "Population vs Area",
    x = "Population 2022",
    y = "Area (km²)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## 7. How to use trace() and recover() in R
trace() is used to insert debugging code into a function temporarily
without editing the function itself. It lets you monitor when a function
is called, what arguments it receives, and what happens inside it.
Basic syntax for trace():
# add tracing to a function
# trace(what = function_name, tracer = code_to_run)
# Trace mean() so that every call to it is reported.
trace(mean)
# While the trace is active, each call prints a "trace:" line before its result.
mean(c(1, 2, 3, 4, 5))
## trace: mean(c(1, 2, 3, 4, 5))
## [1] 3
mean(mtcars$mpg)
## trace: mean(mtcars$mpg)
## [1] 20.09062
# Remove the trace again. Without this, mean() stays traced for the rest of
# the session and "trace: mean" lines appear in unrelated output, for example
# when summary() calls mean() internally.
untrace(mean)
recover() pauses execution at the point of an error and lets you interactively inspect the environment of each function in the call stack.
# Install recover() as the error handler only in interactive sessions, where
# its frame browser can actually be used. In a non-interactive run (Rscript,
# knitting) the global error option is left at its default, so errors are
# handled as usual there.
if (interactive()) {
  options(error = recover)
}
# Take the square root of every element of `values`.
#
# Arguments:
#   values: a list (or vector) of numbers.
# Returns:
#   A vector with the square roots, in the same order as `values`;
#   NULL when `values` is empty. A negative element produces NaN together
#   with the usual sqrt() warning rather than an error.
process_values <- function(values) {
  # lapply() + unlist() builds the result in one pass instead of growing a
  # vector with c() on every iteration, which copies it each time.
  # unname() keeps the result free of the input's element names, matching
  # element-by-element extraction with [[ ]].
  unlist(lapply(unname(values), sqrt))
}
# list with a bad value
# The -1 makes sqrt() emit a warning and return NaN (see the output below).
# It does not raise an error, so an error handler such as recover() is not
# triggered by this call.
my_values <- list(4, 9, -1, 16)
process_values(my_values)
## Warning in sqrt(values[[i]]): NaNs produced
## [1] 2 3 NaN 4
# sapply(), lapply(), vapply() and mapply() are the apply-family functions in R. They replace explicit loops and apply a function over data structures such as vectors, lists, and data frames.
# Define remove_outliers(), then use lapply() to apply it to each numeric column; the result is assigned back to the numeric columns only.
# Replace IQR outliers in a numeric vector with NA.
#
# A value counts as an outlier when it lies below Q1 - k * IQR or above
# Q3 + k * IQR, where Q1 and Q3 are the 25th and 75th percentiles of the
# non-missing values and IQR = Q3 - Q1.
#
# Arguments:
#   x: numeric vector. Existing NAs are ignored when the quartiles are
#      computed and are left as NA in the result.
#   k: fence multiplier. The default of 1.5 is the conventional box-plot rule.
# Returns:
#   `x` with the same length and type, outliers replaced by NA.
remove_outliers <- function(x, k = 1.5) {
  # One quantile() call yields both quartiles; names = FALSE drops the
  # "25%" / "75%" labels so the fences are plain numbers.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  fence_width <- k * (quartiles[2] - quartiles[1])
  lower <- quartiles[1] - fence_width
  upper <- quartiles[2] + fence_width
  # Comparing an NA element gives an NA index, which the assignment skips,
  # so missing values stay missing and nothing else is touched.
  x[x < lower | x > upper] <- NA
  x
}
# Replace outliers with NA in every numeric column of population_data
# (non-numeric columns are passed through unchanged), then summarise the
# cleaned data frame.
population_data[] <- lapply(
  population_data,
  function(column) {
    if (is.numeric(column)) remove_outliers(column) else column
  }
)
summary(population_data)
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## Rank CCA3 Country.Territory Capital
## Min. : 1.00 Length :234 Length :234 Length :234
## 1st Qu.: 59.25 N.unique :234 N.unique :234 N.unique :234
## Median :117.50 N.blank : 0 N.blank : 0 N.blank : 0
## Mean :117.50 Min.nchar: 3 Min.nchar: 4 Min.nchar: 4
## 3rd Qu.:175.75 Max.nchar: 3 Max.nchar: 32 Max.nchar: 19
## Max. :234.00
##
## Continent X2022.Population X2020.Population X2015.Population
## Length :234 Min. : 510 Min. : 520 Min. : 564
## N.unique : 6 1st Qu.: 326101 1st Qu.: 304670 1st Qu.: 280558
## N.blank : 0 Median : 4030358 Median : 3931390 Median : 3771132
## Min.nchar: 4 Mean : 9985056 Mean : 9517226 Mean : 8630101
## Max.nchar: 13 3rd Qu.:12889576 3rd Qu.:12176349 3rd Qu.:11221301
## Max. :54179306 Max. :51985780 Max. :47119728
## NAs :25 NAs :26 NAs :27
## X2010.Population X2000.Population X1990.Population X1980.Population
## Min. : 596 Min. : 651 Min. : 700 Min. : 733
## 1st Qu.: 271390 1st Qu.: 221537 1st Qu.: 172725 1st Qu.: 146000
## Median : 3650902 Median : 3139954 Median : 2392030 Median : 1944578
## Mean : 8161724 Mean : 6401596 Mean : 5100338 Mean : 4091130
## 3rd Qu.:10660788 3rd Qu.: 8871043 3rd Qu.: 7682565 3rd Qu.: 6192514
## Max. :46572772 Max. :38504431 Max. :27657204 Max. :22125224
## NAs :26 NAs :29 NAs :31 NAs :32
## X1970.Population Area..km.. Density..per.km.. Growth.Rate
## Min. : 752 Min. : 1 Min. : 0.0261 Min. :0.9816
## 1st Qu.: 117891 1st Qu.: 1110 1st Qu.: 30.5126 1st Qu.:1.0018
## Median : 1741286 Median : 46949 Median : 82.0961 Median :1.0079
## Mean : 3581916 Mean : 166204 Mean :127.1829 Mean :1.0096
## 3rd Qu.: 5202918 3rd Qu.: 242563 3rd Qu.:176.3380 3rd Qu.:1.0165
## Max. :21434577 Max. :1030700 Max. :525.2334 Max. :1.0378
## NAs :29 NAs :28 NAs :22 NAs :3
## World.Population.Percentage
## Min. :0.000
## 1st Qu.:0.000
## Median :0.050
## Mean :0.125
## 3rd Qu.:0.160
## Max. :0.680
## NAs :25
# Same clean-up as above, but with the numeric-column mask stored in num_cols
# so that it is computed once and can be inspected afterwards.
# NOTE(review): population_data has already been cleaned in place, and
# remove_outliers() recomputes its quartiles from the current values, so this
# further pass can blank additional values — confirm that repeating it on the
# same data frame is intended.
num_cols <- sapply(population_data, is.numeric)
population_data[num_cols] <- lapply(
population_data[num_cols],
remove_outliers
)
# Print the mask: TRUE marks the columns that were treated as numeric.
num_cols
## Rank CCA3
## TRUE FALSE
## Country.Territory Capital
## FALSE FALSE
## Continent X2022.Population
## FALSE TRUE
## X2020.Population X2015.Population
## TRUE TRUE
## X2010.Population X2000.Population
## TRUE TRUE
## X1990.Population X1980.Population
## TRUE TRUE
## X1970.Population Area..km..
## TRUE TRUE
## Density..per.km.. Growth.Rate
## TRUE TRUE
## World.Population.Percentage
## TRUE
# vapply() variant of the numeric-column mask: FUN.VALUE = logical(1) requires
# is.numeric() to return exactly one logical value per column, so the result
# type does not depend on the input the way it can with sapply().
# NOTE(review): this applies remove_outliers() to population_data yet again;
# because the quartiles are recomputed from the already-cleaned values, more
# values can be set to NA on each pass — confirm that this is intended.
num_cols <- vapply(population_data, is.numeric, FUN.VALUE = logical(1))
population_data[num_cols] <- lapply(
population_data[num_cols],
remove_outliers
)