Q1. Importing Data from Different Sources in R
#loading all libraries that will be required
library(readxl)
library(haven)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# A. Import data from an Excel workbook
# read_excel() comes from the readxl package loaded above.
data <- read_excel(
  path = "D:/AUCA Notes/R programming/datsets/power consumption.xlsx"
)
# Preview the first six rows
head(data, n = 6L)
## # A tibble: 6 × 3
## Date operator `Meter Reading(in kwh)`
## <chr> <chr> <dbl>
## 1 11/2/2026@12h57 Emmanuel 111236.
## 2 12/02/2026@12h58 Emmanuel 111506.
## 3 13/02/2026@13:04 Emmanuel 111761
## 4 16/02/2026@13:11 Emmanuel 112628.
## 5 17/02/2026@09:38 Emmanuel 112889.
## 6 24/02/2026@12:27 Emmanuel 115056.
# B. Import data from statistical packages (SPSS, SAS, Stata)
# These haven readers are left commented out; the file names look like
# placeholders, so substitute a real path before running them.
# data <- read_sav("data.sav")      # read an SPSS file
# data <- read_sas("data.sas7bdat") # read a SAS file
# data <- read_dta("data.dta")      # read a Stata file
# C. Import data from text files
# 1. Importing a CSV file
data <- read.csv(
  file = "D:/AUCA Notes/R programming/datsets/world_population.csv"
)
# Preview the first six rows
head(data, n = 6L)
## Rank CCA3 Country.Territory Capital Continent X2022.Population
## 1 36 AFG Afghanistan Kabul Asia 41128771
## 2 138 ALB Albania Tirana Europe 2842321
## 3 34 DZA Algeria Algiers Africa 44903225
## 4 213 ASM American Samoa Pago Pago Oceania 44273
## 5 203 AND Andorra Andorra la Vella Europe 79824
## 6 42 AGO Angola Luanda Africa 35588987
## X2020.Population X2015.Population X2010.Population X2000.Population
## 1 38972230 33753499 28189672 19542982
## 2 2866849 2882481 2913399 3182021
## 3 43451666 39543154 35856344 30774621
## 4 46189 51368 54849 58230
## 5 77700 71746 71519 66097
## 6 33428485 28127721 23364185 16394062
## X1990.Population X1980.Population X1970.Population Area..km..
## 1 10694796 12486631 10752971 652230
## 2 3295066 2941651 2324731 28748
## 3 25518074 18739378 13795915 2381741
## 4 47818 32886 27075 199
## 5 53569 35611 19860 468
## 6 11828638 8330047 6029700 1246700
## Density..per.km.. Growth.Rate World.Population.Percentage
## 1 63.0587 1.0257 0.52
## 2 98.8702 0.9957 0.04
## 3 18.8531 1.0164 0.56
## 4 222.4774 0.9831 0.00
## 5 170.5641 1.0100 0.00
## 6 28.5466 1.0315 0.45
# 2. Importing a TEXT file
# The file has no header row: with header = TRUE the first reading was consumed
# as column names (it showed up as X10.03.2026.12.12 / Emmanuel / X119230.7).
# Read every line as data and supply the column names explicitly instead.
# NOTE(review): the first date below is reconstructed from that mangled column
# name, assuming the same dd/mm/yyyy@hh:mm layout as the other rows.
data <- read.table(
  "D:/AUCA Notes/R programming/datsets/assignment.txt",
  header = FALSE,
  col.names = c("Date", "operator", "Meter_Reading_kwh")
)
head(data)
## Date operator Meter_Reading_kwh
## 1 10/03/2026@12:12 Emmanuel 119230.7
## 2 11/03/2026@17:15 Emmanuel 119548.2
## 3 16/03/2026@12:23 Emmanuel 120885.9
## 4 17/03/2026@12:13 Emmanuel 121165.1
## 5 18/03/2026@12:54 Emmanuel 121449.8
## 6 19/03/2026@12:36 Emmanuel 121708.2
Q2.A. Merging Data sets by 2 to 3 Variables
# Dataset 1: ID, name and age of each student
data1 <- data.frame(
  ID   = c(1, 2, 3),
  Name = c("emmy", "peter", "kevin"),
  Age  = c(20, 19, 22)
)
# Dataset 2: ID, name and marks of the same students
data2 <- data.frame(
  ID    = c(1, 2, 3),
  Name  = c("emmy", "peter", "kevin"),
  Marks = c(80, 90, 70)
)
# Merging data1 and data2 by two variables (ID and Name)
merged_data <- merge(x = data1, y = data2, by = c("ID", "Name"))
head(merged_data, n = 6L)
## ID Name Age Marks
## 1 1 emmy 20 80
## 2 2 peter 19 90
## 3 3 kevin 22 70
Q2.B Merging datasets by 3 Variables(ID,Name and Age)
# Dataset 1: ID, name, age and gender of each student
data1 <- data.frame(
  ID     = c(1, 2, 3),
  Name   = c("emmy", "peter", "kevine"),
  Age    = c(20, 21, 22),
  Gender = c("M", "M", "F")
)
# Dataset 2: ID, name, age and marks of the same students
data2 <- data.frame(
  ID    = c(1, 2, 3),
  Name  = c("emmy", "peter", "kevine"),
  Age   = c(20, 21, 22),
  Marks = c(80, 90, 70)
)
# Merging the datasets by three variables (ID, Name and Age)
merged_data <- merge(x = data1, y = data2, by = c("ID", "Name", "Age"))
head(merged_data, n = 6L)
## ID Name Age Gender Marks
## 1 1 emmy 20 M 80
## 2 2 peter 21 M 90
## 3 3 kevine 22 F 70
Q3. Show how to use group_by(), $ and %>%
# Read the CO2 emission dataset into a data frame
co2 <- read.csv("D:/AUCA Notes/R programming/datsets/CO2_emission.csv")
# Use $ to extract a single column (Country.Name) from the data frame;
# the result is printed as a character vector of country names
co2$Country.Name
## [1] "Aruba" "Afghanistan"
## [3] "Angola" "Albania"
## [5] "Andorra" "United Arab Emirates"
## [7] "Argentina" "Armenia"
## [9] "American Samoa" "Antigua and Barbuda"
## [11] "Australia" "Austria"
## [13] "Azerbaijan" "Burundi"
## [15] "Belgium" "Benin"
## [17] "Burkina Faso" "Bangladesh"
## [19] "Bulgaria" "Bahrain"
## [21] "Bahamas, The" "Bosnia and Herzegovina"
## [23] "Belarus" "Belize"
## [25] "Bermuda" "Bolivia"
## [27] "Brazil" "Barbados"
## [29] "Brunei Darussalam" "Bhutan"
## [31] "Botswana" "Central African Republic"
## [33] "Canada" "Switzerland"
## [35] "Chile" "China"
## [37] "Cote d'Ivoire" "Cameroon"
## [39] "Congo, Dem. Rep." "Congo, Rep."
## [41] "Colombia" "Comoros"
## [43] "Cabo Verde" "Costa Rica"
## [45] "Cuba" "Curacao"
## [47] "Cayman Islands" "Cyprus"
## [49] "Czech Republic" "Germany"
## [51] "Djibouti" "Dominica"
## [53] "Denmark" "Dominican Republic"
## [55] "Algeria" "Ecuador"
## [57] "Egypt, Arab Rep." "Eritrea"
## [59] "Spain" "Estonia"
## [61] "Ethiopia" "Finland"
## [63] "Fiji" "France"
## [65] "Faroe Islands" "Micronesia, Fed. Sts."
## [67] "Gabon" "United Kingdom"
## [69] "Georgia" "Ghana"
## [71] "Gibraltar" "Guinea"
## [73] "Gambia, The" "Guinea-Bissau"
## [75] "Equatorial Guinea" "Greece"
## [77] "Grenada" "Greenland"
## [79] "Guatemala" "Guam"
## [81] "Guyana" "Hong Kong SAR, China"
## [83] "Honduras" "Croatia"
## [85] "Haiti" "Hungary"
## [87] "Indonesia" "Isle of Man"
## [89] "India" "Ireland"
## [91] "Iran, Islamic Rep." "Iraq"
## [93] "Iceland" "Israel"
## [95] "Italy" "Jamaica"
## [97] "Jordan" "Japan"
## [99] "Kazakhstan" "Kenya"
## [101] "Kyrgyz Republic" "Cambodia"
## [103] "Kiribati" "St. Kitts and Nevis"
## [105] "Korea, Rep." "Kuwait"
## [107] "Lao PDR" "Lebanon"
## [109] "Liberia" "Libya"
## [111] "St. Lucia" "Liechtenstein"
## [113] "Sri Lanka" "Lesotho"
## [115] "Lithuania" "Luxembourg"
## [117] "Latvia" "Macao SAR, China"
## [119] "St. Martin (French part)" "Morocco"
## [121] "Monaco" "Moldova"
## [123] "Madagascar" "Maldives"
## [125] "Mexico" "Marshall Islands"
## [127] "North Macedonia" "Mali"
## [129] "Malta" "Myanmar"
## [131] "Montenegro" "Mongolia"
## [133] "Northern Mariana Islands" "Mozambique"
## [135] "Mauritania" "Mauritius"
## [137] "Malawi" "Malaysia"
## [139] "Namibia" "New Caledonia"
## [141] "Niger" "Nigeria"
## [143] "Nicaragua" "Netherlands"
## [145] "Norway" "Nepal"
## [147] "Nauru" "New Zealand"
## [149] "Oman" "Pakistan"
## [151] "Panama" "Peru"
## [153] "Philippines" "Palau"
## [155] "Papua New Guinea" "Poland"
## [157] "Puerto Rico" "Korea, Dem. People's Rep."
## [159] "Portugal" "Paraguay"
## [161] "West Bank and Gaza" "French Polynesia"
## [163] "Qatar" "Romania"
## [165] "Russian Federation" "Rwanda"
## [167] "Saudi Arabia" "Sudan"
## [169] "Senegal" "Singapore"
## [171] "Solomon Islands" "Sierra Leone"
## [173] "El Salvador" "San Marino"
## [175] "Somalia" "Serbia"
## [177] "South Sudan" "Sao Tome and Principe"
## [179] "Suriname" "Slovak Republic"
## [181] "Slovenia" "Sweden"
## [183] "Eswatini" "Sint Maarten (Dutch part)"
## [185] "Seychelles" "Syrian Arab Republic"
## [187] "Turks and Caicos Islands" "Chad"
## [189] "Togo" "Thailand"
## [191] "Tajikistan" "Turkmenistan"
## [193] "Timor-Leste" "Tonga"
## [195] "Trinidad and Tobago" "Tunisia"
## [197] "Turkiye" "Tuvalu"
## [199] "Tanzania" "Uganda"
## [201] "Ukraine" "Uruguay"
## [203] "United States" "Uzbekistan"
## [205] "St. Vincent and the Grenadines" "Venezuela, RB"
## [207] "British Virgin Islands" "Virgin Islands (U.S.)"
## [209] "Vietnam" "Vanuatu"
## [211] "Samoa" "Yemen, Rep."
## [213] "South Africa" "Zambia"
## [215] "Zimbabwe"
Q4. How to use trace() and recover() ## debugging tools used to see what happens inside a function
# Load the world population data used in the tracing demo
population <- read.csv("D:/AUCA Notes/R programming/datsets/world_population.csv")
# Start watching mean(): while tracing is on, each call to mean() is echoed
# as a "trace:" line (see the output below)
trace(mean)
# Calculate average population for 2022; the call itself is reported first
mean(population$X2022.Population, na.rm = TRUE)
## trace: mean(population$X2022.Population, na.rm = TRUE)
## [1] 34074415
# Stop watching mean()
untrace(mean)
# USING recover()
# Turn on recover mode: after an uncaught error R lists the active function
# calls so that one of them can be browsed. This is meant for interactive
# sessions; when run non-interactively recover() only dumps the frames.
options(error = recover)
# Intentional error for practice: trim must be a single number, so passing a
# string makes mean() stop. (na.rm = "TRUE" is not suitable for this demo:
# that call returns 34074415 without raising any error.)
mean(population$X2022.Population, trim = "TRUE")
# Expected: an error reporting that 'trim' must be numeric of length one,
# after which recover() takes over.
# Turn off recover mode so later errors behave normally again
options(error = NULL)
Q5. Creating a custom summary() function
# Read imported population dataset
population <- read.csv("D:/AUCA Notes/R programming/datsets/world_population.csv")
# Create our own summary() function
#' Summarise a numeric vector
#'
#' Hand-written counterpart to summary(): drops missing values and reports the
#' count, range, mean, median, quartiles and sum of what remains.
#'
#' @param x A numeric (or logical) vector. NA values are removed first.
#' @return A named list with elements Count, Minimum, Maximum, Mean, Median,
#'   Q1, Q3 and Sum. Q1 and Q3 keep the "25%" / "75%" names produced by
#'   quantile(). If no non-missing values remain, Count is 0, Sum is 0 and
#'   every other statistic is NA.
my_summary <- function(x) {
  # Fail early with a readable message instead of a cryptic error from
  # quantile() or sum() further down.
  if (!(is.null(x) || is.numeric(x) || is.logical(x))) {
    stop("`x` must be a numeric vector", call. = FALSE)
  }

  # Remove missing values
  values <- x[!is.na(x)]
  # Count the values that are left
  n <- length(values)

  # Nothing to summarise: min()/max() would warn and return Inf/-Inf and
  # mean() would return NaN, so report NA explicitly instead.
  if (n == 0L) {
    return(list(
      Count = n,
      Minimum = NA_real_,
      Maximum = NA_real_,
      Mean = NA_real_,
      Median = NA_real_,
      Q1 = c("25%" = NA_real_),
      Q3 = c("75%" = NA_real_),
      Sum = 0
    ))
  }

  # First and third quartile in a single pass
  quartiles <- quantile(values, probs = c(0.25, 0.75))

  list(
    Count = n,
    Minimum = min(values),
    Maximum = max(values),
    Mean = mean(values),
    Median = median(values),
    Q1 = quartiles[1],
    Q3 = quartiles[2],
    # Add up in double precision: sum() of an integer vector returns NA (with
    # a warning) once the total exceeds .Machine$integer.max.
    Sum = sum(as.numeric(values))
  )
}
# Apply function to imported population data
my_summary(population$X2022.Population)
## $Count
## [1] 234
##
## $Minimum
## [1] 510
##
## $Maximum
## [1] 1425887337
##
## $Mean
## [1] 34074415
##
## $Median
## [1] 5559945
##
## $Q1
## 25%
## 419738.5
##
## $Q3
## 75%
## 22476505
##
## $Sum
## [1] 7973413042
# This shows summary statistics for 2022 population.
Q6. Apply Functions (lapply, sapply, vapply, mapply)
Q7. Apply geom_bar(), geom_col() (bar plot) and geom_smooth()
# Read dataset
population <- read.csv("D:/AUCA Notes/R programming/datsets/world_population.csv")
# Load packages
# Check column names
colnames(population)
## [1] "Rank" "CCA3"
## [3] "Country.Territory" "Capital"
## [5] "Continent" "X2022.Population"
## [7] "X2020.Population" "X2015.Population"
## [9] "X2010.Population" "X2000.Population"
## [11] "X1990.Population" "X1980.Population"
## [13] "X1970.Population" "Area..km.."
## [15] "Density..per.km.." "Growth.Rate"
## [17] "World.Population.Percentage"
# 1. geom_bar()
# Bar chart of how many countries (rows) fall in each continent
ggplot(data = population, mapping = aes(x = Continent)) +
  geom_bar() +
  labs(title = "Number of Countries by Continent")
# 2. geom_col() (bar plot of values computed beforehand)
# Average 2022 population per continent, ignoring missing values
continent_pop <- summarise(
  group_by(population, Continent),
  Average_Population = mean(X2022.Population, na.rm = TRUE)
)
# One bar per continent, bar height = the average computed above
ggplot(
  data = continent_pop,
  mapping = aes(x = Continent, y = Average_Population)
) +
  geom_col() +
  labs(title = "Average Population by Continent")
# 3. geom_smooth()
# Scatter plot of area against 2022 population with a smoothed trend line
ggplot(
  data = population,
  mapping = aes(x = Area..km.., y = X2022.Population)
) +
  geom_point() +
  geom_smooth() +
  labs(title = "Relationship Between Area and Population")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'