library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
In this task, we are required to import datasets from different sources into RStudio. Therefore, I am going to import datasets from a CSV file, an Excel sheet, and an SPSS file, and then connect to a SQLite database. This process is important because data can come in different formats, and RStudio provides tools and packages that make it easy to load and analyze such data efficiently. By importing datasets from these sources, we can prepare the data for cleaning, analysis, and visualization.
# CSV file, read with base R
dataset_1 <- read.csv(
  "C:\\Users\\NISR\\Desktop\\Women's waiting time dataset for dieudonne.csv"
)
# variable.names(dataset_1)
# library(dplyr)
# count(dataset_1)
# Excel workbook, read with readxl
library(readxl)
dataset_2 <- read_excel(
  "C:\\Users\\NISR\\Desktop\\exercise.xlsx"
)
# variable.names(dataset_2)
# count(dataset_2)
# SPSS file, read with haven
library(haven)
dataset_3 <- read_spss(
  "C:\\Users\\NISR\\Desktop\\reberaha\\URGENT\\Waiting time Dataset.sav"
)
# variable.names(dataset_3)
# count(dataset_3)
# SQLite database, opened through DBI with the RSQLite driver
# NOTE(review): the connection is never closed in the visible code; consider
# calling dbDisconnect(connection) once all queries are done.
library(DBI)
library(RSQLite)
connection <- dbConnect(
  RSQLite::SQLite(),
  "auca_database.db"
)
# dbGetQuery(connection, "select * from gishushu")
Data analysis often requires combining information from multiple datasets in order to create a complete and meaningful dataset for analysis. In RStudio, merging datasets is an important technique that allows researchers and analysts to integrate data based on common variables such as IDs, names, dates, or other related fields. Depending on the structure of the data, datasets can be merged using two, three, or even more variables to ensure accuracy and consistency. This task focuses on understanding how to merge datasets in RStudio using different variables, which helps improve data organization, reduce redundancy, and prepare datasets for further statistical analysis and visualization.
library(magrittr)
# Kinyarwanda and Maths marks of four students
Kinyarwanda_and_Maths_marks <- data.frame(
  Names       = c("Claude", "Mugisha", "Patrick", "Aimable"),
  Kinyarwanda = c(92, 78, 87, 83),
  Maths       = c(67, 75, 81, 77)
)
# Kinyarwanda and Geography marks of the same four students; note that the
# Kinyarwanda marks differ from those in the first table
Kinyarwanda_and_Geoagraphy_marks <- data.frame(
  Names       = c("Claude", "Mugisha", "Patrick", "Aimable"),
  Kinyarwanda = c(84, 83, 65, 78),
  Geography   = c(90, 80, 79, 89)
)
# There are different ways to perform the merge; the functions used below are:
#merge
#inner_join
#left_join
#right_join
#full_join
# Inner join with base R: only key pairs present in both tables are kept.
# No (Names, Kinyarwanda) pair is identical across the tables, so 0 rows result.
merge(
  x = Kinyarwanda_and_Maths_marks,
  y = Kinyarwanda_and_Geoagraphy_marks,
  by = c("Names", "Kinyarwanda")
)
## [1] Names Kinyarwanda Maths Geography
## <0 rows> (or 0-length row.names)
# Same inner join with dplyr
inner_join(
  x = Kinyarwanda_and_Maths_marks,
  y = Kinyarwanda_and_Geoagraphy_marks,
  by = c("Names", "Kinyarwanda")
)
## [1] Names Kinyarwanda Maths Geography
## <0 rows> (or 0-length row.names)
# Left join with base R: every row of the first table is kept, and Geography
# is NA because nothing matched. The result is ordered by the key columns.
merge(
  x = Kinyarwanda_and_Maths_marks,
  y = Kinyarwanda_and_Geoagraphy_marks,
  by = c("Names", "Kinyarwanda"),
  all.x = TRUE
)
## Names Kinyarwanda Maths Geography
## 1 Aimable 83 77 NA
## 2 Claude 92 67 NA
## 3 Mugisha 78 75 NA
## 4 Patrick 87 81 NA
# Left join with dplyr: same rows, kept in the order of the first table
left_join(
  x = Kinyarwanda_and_Maths_marks,
  y = Kinyarwanda_and_Geoagraphy_marks,
  by = c("Names", "Kinyarwanda")
)
## Names Kinyarwanda Maths Geography
## 1 Claude 92 67 NA
## 2 Mugisha 78 75 NA
## 3 Patrick 87 81 NA
## 4 Aimable 83 77 NA
# Right join with base R: every row of the second table is kept, Maths is NA
merge(
  x = Kinyarwanda_and_Maths_marks,
  y = Kinyarwanda_and_Geoagraphy_marks,
  by = c("Names", "Kinyarwanda"),
  all.y = TRUE
)
## Names Kinyarwanda Maths Geography
## 1 Aimable 78 NA 89
## 2 Claude 84 NA 90
## 3 Mugisha 83 NA 80
## 4 Patrick 65 NA 79
# Right join with dplyr: same rows, kept in the order of the second table
right_join(
  x = Kinyarwanda_and_Maths_marks,
  y = Kinyarwanda_and_Geoagraphy_marks,
  by = c("Names", "Kinyarwanda")
)
## Names Kinyarwanda Maths Geography
## 1 Claude 84 NA 90
## 2 Mugisha 83 NA 80
## 3 Patrick 65 NA 79
## 4 Aimable 78 NA 89
# Full join with base R: rows of both tables are kept (4 + 4 = 8 rows)
merge(
  x = Kinyarwanda_and_Maths_marks,
  y = Kinyarwanda_and_Geoagraphy_marks,
  by = c("Names", "Kinyarwanda"),
  all = TRUE
)
## Names Kinyarwanda Maths Geography
## 1 Aimable 78 NA 89
## 2 Aimable 83 77 NA
## 3 Claude 84 NA 90
## 4 Claude 92 67 NA
## 5 Mugisha 78 75 NA
## 6 Mugisha 83 NA 80
## 7 Patrick 65 NA 79
## 8 Patrick 87 81 NA
# Full join with dplyr: rows of the first table come first, then the second
full_join(
  x = Kinyarwanda_and_Maths_marks,
  y = Kinyarwanda_and_Geoagraphy_marks,
  by = c("Names", "Kinyarwanda")
)
## Names Kinyarwanda Maths Geography
## 1 Claude 92 67 NA
## 2 Mugisha 78 75 NA
## 3 Patrick 87 81 NA
## 4 Aimable 83 77 NA
## 5 Claude 84 NA 90
## 6 Mugisha 83 NA 80
## 7 Patrick 65 NA 79
## 8 Aimable 78 NA 89
In data analysis, organizing and summarizing data is an important step for understanding patterns and making decisions. In RStudio, the group_by() function and the pipe operator %>% from the dplyr package are widely used for data manipulation. The group_by() function helps to arrange data into groups based on one or more variables, while %>% allows commands to be connected in a clear and readable sequence. This task focuses on learning how to use group_by() together with %>% to perform operations such as summarizing, counting, and analyzing grouped data efficiently. These tools make data analysis simpler, faster, and easier to understand.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Kinyarwanda and Geography marks of four students
Classss <- data.frame(
  Names = c("Claude", "Mugisha", "Patrick", "Aimable"),
  Kinyarwanda = c(84, 83, 65, 78),
  Geography = c(90, 80, 79, 89)
)
# The pipe (%>%) passes the table on to tally(), which counts its rows
Classss %>% tally()
## n
## 1 4
# Group the rows by student name, then summarise the Geography marks of each
# group. Every name occurs once, so mean, max and min coincide per student.
# The summary columns are named after the variable they describe (Geography).
Classss %>%
  group_by(Names) %>%
  summarise(
    mean_geography = mean(Geography),
    max_geography = max(Geography),
    min_geography = min(Geography)
  )
## # A tibble: 4 × 4
## Names mean_geography max_geography min_geography
## <chr> <dbl> <dbl> <dbl>
## 1 Aimable 89 89 89
## 2 Claude 90 90 90
## 3 Mugisha 80 80 80
## 4 Patrick 79 79 79
In R programming, debugging is an important process used to identify and fix errors in code. Two useful debugging tools in R are trace() and recover(). The trace() function allows programmers to monitor and inspect how a function is executed by inserting debugging code into a function. On the other hand, recover() helps users investigate errors by allowing them to browse function calls after an error occurs. These tools are helpful for understanding program behavior, locating mistakes, and improving code efficiency.
The trace() function is used to temporarily modify a function to print messages or run debugging commands.
# Create a function: returns the sum of its two arguments
add_numbers <- function(x, y) {
  x + y
}
# Trace the function: no tracer code is supplied here, so each call to
# add_numbers() is simply announced (the "trace:" line in the output below)
trace(add_numbers)
# Run the function; the traced call is reported first, then the result
add_numbers(5, 6)
## trace: add_numbers(5, 6)
## [1] 11
The recover() function is used for error handling and debugging. It allows you to inspect the environment where an error occurred.
# Install recover() as the error handler
options(error = recover)
# Helper that adds 1 to its argument
add_one <- function(x) {
  x + 1
}
# Adds 1 and then doubles the result
# NOTE: this definition masks base::double() while it exists
double <- function(x) {
  add_one(x) * 2
}
# 5 is numeric, so this call succeeds and recover() is never entered;
# a call such as double("a") would fail inside add_one() and trigger it
double(5)
## [1] 12
# Remove the error handler again
options(error = NULL)
In data analysis, summary statistics are important because they help describe and understand the main characteristics of a dataset. In R programming, functions can be created to automate calculations such as the mean, median, minimum, maximum, standard deviation, and other statistical measures. Creating a custom function allows users to save time, reduce repetition, and improve the efficiency of data analysis. In this task, the objective is to create a personal R function that calculates summary statistics for a dataset or selected variables, making data interpretation easier and more organized.
# ── Mean ──────────────────────────────────────────────
# Arithmetic mean: the sum of the values divided by how many there are.
my_mean <- function(data) {
  total <- sum(data)
  count <- length(data)
  total / count
}
# ── Median ────────────────────────────────────────────
# Median of a numeric vector.
#   data: numeric vector; NA values are dropped by sort() before the
#         median is computed.
#   Returns the middle value for an odd number of values, the mean of the
#   two middle values for an even number, and NA_real_ when no values
#   remain (empty input or all NA).
my_median <- function(data) {
  sorted_data <- sort(data)
  n <- length(sorted_data)
  # Guard: without this, an empty input would return numeric(0)
  if (n == 0) {
    return(NA_real_)
  }
  mid <- n %/% 2
  if (n %% 2 == 0) {
    (sorted_data[mid] + sorted_data[mid + 1]) / 2
  } else {
    sorted_data[mid + 1]
  }
}
# ── Minimum ───────────────────────────────────────────
# Smallest value: start from the first element and keep whichever value
# turns out to be smaller while scanning the whole vector.
my_min <- function(data) {
  smallest <- data[1]
  for (value in data) {
    if (smallest > value) {
      smallest <- value
    }
  }
  smallest
}
# ── Maximum ───────────────────────────────────────────
# Largest value: start from the first element and keep whichever value
# turns out to be larger while scanning the whole vector.
my_max <- function(data) {
  largest <- data[1]
  for (value in data) {
    if (largest < value) {
      largest <- value
    }
  }
  largest
}
# ── Standard Deviation (Population) ──────────────────
# Population standard deviation: the square root of the mean squared
# deviation from my_mean(data); the divisor is length(data), not n - 1.
my_sd <- function(data) {
  deviations <- data - my_mean(data)
  sqrt(sum(deviations^2) / length(data))
}
# ── All-in-one Summary Function ───────────────────────
# Prints a formatted report of the summary statistics of `data`, using the
# helper functions defined in this file: my_mean(), my_median(), my_min(),
# my_max() and my_sd() (population standard deviation).
#   data: numeric vector to summarise.
# Called for its printed output only.
summary_statistics <- function(data) {
  rule <- "================================\n"
  cat(rule)
  cat(" SUMMARY STATISTICS \n")
  cat(rule)
  cat(sprintf("Mean: %.4f\n", my_mean(data)))
  cat(sprintf("Median: %.4f\n", my_median(data)))
  cat(sprintf("Minimum: %.4f\n", my_min(data)))
  cat(sprintf("Maximum: %.4f\n", my_max(data)))
  # my_sd() was defined but never reported; include it in the summary
  cat(sprintf("Std Dev: %.4f\n", my_sd(data)))
  cat(rule)
}
# ── Test it ───────────────────────────────────────────
data <- c(8, 7, 7, 16, 6, 30, 5, 32, 13, 34)
summary_statistics(data)
## ================================
## SUMMARY STATISTICS
## ================================
## Mean: 15.8000
## Median: 10.5000
## Minimum: 5.0000
## Maximum: 34.0000
## Std Dev: 11.0977
## ================================
# lapply() applies the function to every element and returns a list
numbers <- list(1, 2, 3, 4)
result <- lapply(
  numbers,
  function(x) x * 2
)
print(result)
## [[1]]
## [1] 2
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 6
##
## [[4]]
## [1] 8
sapply() simplifies the result into vectors, matrices, or arrays whenever possible.
# One value per element: the result is simplified to a vector
numbers <- list(1, 2, 3, 4)
result <- sapply(
  numbers,
  function(x) x * 2
)
print(result)
## [1] 2 4 6 8
# Two values per element: the result is simplified to a matrix with one
# column per input element
numbers <- list(1, 2, 3)
result <- sapply(
  numbers,
  function(x) c(x, x^2)
)
print(result)
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 1 4 9
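vapply() works like sapply(), but it requires specifying the expected type and length of each result through the FUN.VALUE argument, and it raises an error if a result does not match.
Its general form is: vapply(X, FUN, FUN.VALUE)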
# Each result must be a single number, as declared by numeric(1)
numbers <- list(1, 2, 3, 4)
result <- vapply(numbers, function(x) x * 2, numeric(1))
print(result)
## [1] 2 4 6 8
#### Example — Character Output
# Each result must be a single string, as declared by character(1)
words <- list("caw", "goat", "bird")
result <- vapply(words, toupper, character(1))
print(result)
## [1] "CAW" "GOAT" "BIRD"