#ASSIGNMENT ONE
###Import Data from statistical packages (SPSS,STATA AND SAS)
# Install each import package only when it is missing, and name the CRAN
# mirror explicitly: a bare install.packages() call stops non-interactive runs
# with "trying to use CRAN without setting a mirror".
if (!requireNamespace("haven", quietly = TRUE)) {
  install.packages("haven", repos = "https://cloud.r-project.org")
}
library(haven)
# SPSS and Stata examples (left commented out).
#data1 <- read_spss("C:/Users/HP/Desktop/data_spss.sav")
#data_stata <- read_stata("C:/Users/HP/Desktop/data_stata.dta")
#Import data from Excel and CSV
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl", repos = "https://cloud.r-project.org")
}
library(readxl)
# read.csv() is base R; the result is a data frame named data2.
data2 <- read.csv("C:/Users/HP/Desktop/My_Masters/HW1_Fish/Fish.csv")
# Show the first six rows.
head(data2)
## Species Weight Length1 Length2 Length3 Height Width
## 1 Bream 242 23.2 25.4 30.0 11.5200 4.0200
## 2 Bream 290 24.0 26.3 31.2 12.4800 4.3056
## 3 Bream 340 23.9 26.5 31.1 12.3778 4.6961
## 4 Bream 363 26.3 29.0 33.5 12.7300 4.4555
## 5 Bream 430 26.5 29.0 34.0 12.4440 5.1340
## 6 Bream 450 26.8 29.7 34.7 13.6024 4.9274
# Excel and SAS examples (left commented out).
#data3 <- read_excel("C:/Users/HP/Desktop/FEB INTAKE.xlsx")
#data_sas <- read_sas("C:/Users/HP/Desktop/crimestats.sas7bdat")
#database management systems
#chooseCRANmirror()
# Install the database packages only when they are missing, with an explicit
# CRAN mirror so the call also works in non-interactive runs.
if (!requireNamespace("odbc", quietly = TRUE)) {
  install.packages("odbc", repos = "https://cloud.r-project.org")
}
if (!requireNamespace("DBI", quietly = TRUE)) {
  install.packages("DBI", repos = "https://cloud.r-project.org")
}
library(DBI)
library(odbc)
# Open exactly one connection. The "ODBC Driver 17 for SQL Server" driver is
# tried first; the "SQL Server" driver is used only if that attempt fails.
# Opening both in sequence would overwrite `con` and leave the first
# connection open with no handle to close it.
con <- tryCatch(
  dbConnect(
    odbc(),
    Driver = "ODBC Driver 17 for SQL Server",
    Server = "localhost\\SQLEXPRESS",
    Database = "GradeDB",
    Trusted_Connection = "Yes"
  ),
  error = function(e) {
    dbConnect(
      odbc(),
      Driver = "SQL Server",
      Server = "localhost\\SQLEXPRESS",
      Database = "GradeDB",
      Trusted_Connection = "True"
    )
  }
)
# Display the first 10 tables only
head(dbListTables(con), 10)
## [1] "21stGraduation" "23953" "A1_2017"
## [4] "A12018All" "A1All" "A1All2"
## [7] "A1andProvA0_May2015B" "A1andProvA0_May2015C" "A1andProvAO_May2015"
## [10] "A1ApplicationMay2019"
# Read the whole "v_course" table into a data frame.
students_data <- dbReadTable(con, "v_course")
head(students_data)
## COURSECODE CourseName
## 1 1 Zero Credit 1
## 2 ACCT 229 Fiscalité ACCT 229
## 3 ACCT 1100 Initiation à La Comptabilité ACCT 1100
## 4 ACCT 1101 Comptabilité Générale ACCT 1101
## 5 ACCT 1102 Initiation à La Comptabilité ACCT 1102
## 6 ACCT 1103 Comptabilité Internationale ACCT 1103
# Release the connection now that the data has been read.
dbDisconnect(con)
#ASSIGNMENT TWO
###MERGING DATASET WITH 2 OR 3 VARIABLES In R, merging means combining two data frames based on one or more common variables (columns).
# Left-hand table: one row per student with a numeric score
data1 <- data.frame(
  ID = c(1, 2, 3, 4),
  Name = c("John", "jackson", "Paul", "Fabrice"),
  Year = rep(c(2023, 2024), each = 2),
  Score = c(80, 90, 75, 88)
)
# Right-hand table: same names and years, but the last ID is 5 instead of 4
data2 <- data.frame(
  ID = c(1, 2, 3, 5),
  Name = c("John", "jackson", "Paul", "Fabrice"),
  Year = rep(c(2023, 2024), each = 2),
  Grade = c("A", "A", "B", "C")
)
head(x = data1)
## ID Name Year Score
## 1 1 John 2023 80
## 2 2 jackson 2023 90
## 3 3 Paul 2024 75
## 4 4 Fabrice 2024 88
head(x = data2)
## ID Name Year Grade
## 1 1 John 2023 A
## 2 2 jackson 2023 A
## 3 3 Paul 2024 B
## 4 5 Fabrice 2024 C
# Merge on TWO key columns; the non-key Year column appears once per table
# (Year.x / Year.y)
merge(x = data1, y = data2, by = c("ID", "Name"))
## ID Name Year.x Score Year.y Grade
## 1 1 John 2023 80 2023 A
## 2 2 jackson 2023 90 2023 A
## 3 3 Paul 2024 75 2024 B
# Merge on THREE key columns; Year is now a key, so it appears only once
merge(x = data1, y = data2, by = c("ID", "Name", "Year"))
## ID Name Year Score Grade
## 1 1 John 2023 80 A
## 2 2 jackson 2023 90 A
## 3 3 Paul 2024 75 B
# merging using left and right join
# Install dplyr only when it is missing, with an explicit CRAN mirror; a bare
# install.packages() call fails in non-interactive runs with
# "trying to use CRAN without setting a mirror".
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr", repos = "https://cloud.r-project.org")
}
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Left join: keeps every row of data1; columns coming from data2 are NA where
# no matching ID exists (ID 4 below).
left_join(data1, data2, by = "ID")
## ID Name.x Year.x Score Name.y Year.y Grade
## 1 1 John 2023 80 John 2023 A
## 2 2 jackson 2023 90 jackson 2023 A
## 3 3 Paul 2024 75 Paul 2024 B
## 4 4 Fabrice 2024 88 <NA> NA <NA>
# Right join: keeps every row of data2; columns coming from data1 are NA where
# no matching ID exists (ID 5 below).
right_join(data1, data2, by = "ID")
## ID Name.x Year.x Score Name.y Year.y Grade
## 1 1 John 2023 80 John 2023 A
## 2 2 jackson 2023 90 jackson 2023 A
## 3 3 Paul 2024 75 Paul 2024 B
## 4 5 <NA> NA NA Fabrice 2024 C
#ASSIGNMENT THREE Give examples using these functions: select(), filter(), arrange(), rename(), mutate(), group_by(), and %>%
# select()
# Use it when a dataset has many variables but only a few columns are needed.
library(dplyr)
students <- data.frame(
  ID = c(40, 20, 30, 10),
  Names = c("Jackson", "Fabrice", "Giddy", "Paul"),
  Age = c(30, 28, 29, 40),
  Marks = c(60, 48, 50, 49)
)
students %>% select(ID:Age) # a contiguous range of columns
## ID Names Age
## 1 40 Jackson 30
## 2 20 Fabrice 28
## 3 30 Giddy 29
## 4 10 Paul 40
# Other select() forms, left commented out:
#select(students, -Age) #Exclude columns
#select(students, contains("a")) #Select columns containing text
#select(students, starts_with("M")) #Select columns starts with
#select(students,Student_Names = Names) #Rename while selecting
# filter()
# Keeps only the rows that satisfy the given condition(s).
students %>% filter(Marks < 50)
## ID Names Age Marks
## 1 20 Fabrice 28 48
## 2 10 Paul 40 49
#filter(students, Age == 20 & Marks > 80) #Multiple conditions
# arrange()
# Sorts rows; ascending by default, desc() reverses the order.
students %>% arrange(desc(Marks))
## ID Names Age Marks
## 1 40 Jackson 30 60
## 2 30 Giddy 29 50
## 3 10 Paul 40 49
## 4 20 Fabrice 28 48
# mutate()
# Adds new columns or overwrites existing ones; here a Bonus column is added.
students %>% mutate(Bonus = Marks + 5)
## ID Names Age Marks Bonus
## 1 40 Jackson 30 60 65
## 2 20 Fabrice 28 48 53
## 3 30 Giddy 29 50 55
## 4 10 Paul 40 49 54
# group_by() and %>%
# The pipe hands the result of each step to the next one: group the rows by
# Age, then total the marks inside each group.
students %>% group_by(Age) %>% summarise(total_marks = sum(Marks))
## # A tibble: 4 × 2
## Age total_marks
## <dbl> <dbl>
## 1 28 48
## 2 29 50
## 3 30 60
## 4 40 49
#ASSIGNMENT FOUR Use online resources to show how these functions work: 1. trace(), 2. recover()
trace(): the trace() function temporarily inserts code into a function so that you can debug it or understand what it is doing internally. It helps you see what happens inside a function.
recover(): used to debug errors interactively when a function crashes. When an error happens, R will show:
Enter a frame number, or 0 to exit
#trace()
#trace() = “peek inside a function and modify or observe how it runs”
# Divide x by y, print the sign of the quotient followed by its value, and
# return the quotient invisibly.
#
# Arguments:
#   x  numerator: a single finite number
#   y  denominator: a single finite, non-zero number
# Returns:
#   x / y, invisibly, so the value can be assigned without being auto-printed.
# Errors:
#   "Cannot divide by zero" when y is 0; a stopifnot() error when x or y is
#   not a single finite number.
div <- function(x, y) {
  # if() needs a single TRUE/FALSE, so reject vectors, NA and non-numbers early.
  stopifnot(
    is.numeric(x), length(x) == 1L, is.finite(x),
    is.numeric(y), length(y) == 1L, is.finite(y)
  )
  if (y == 0) {
    stop("Cannot divide by zero")
  }
  results <- x / y
  # A quotient of exactly zero is neither positive nor negative.
  if (results > 0) {
    label <- "Answer is positive"
  } else if (results < 0) {
    label <- "Answer is negative"
  } else {
    label <- "Answer is zero"
  }
  # Label and value on separate lines, terminated by a newline so that later
  # console output does not continue on the same line.
  cat(label, "\n", results, "\n", sep = "")
  invisible(results)
}
# Install a tracer on div(): the quoted expression is evaluated on entry to
# every call and prints the arguments. The trailing "\n" ends the tracer line
# so that later output starts on a fresh line.
trace(
  div,
  tracer = quote(cat("Tracing: x=", x, "y= ", y, "\n"))
)
## [1] "div"
#3 — Use recover()
# Install recover() as the error handler, keeping the previous setting so it
# can be restored once the demonstration is over.
old_options <- options(error = recover)
div(4, 0)
## Tracing div(4, 0) on entry
## Tracing: x= 4 y= 0
## Error in div(4, 0): Cannot divide by zero
# Undo the debugging hooks so the rest of the session is not affected.
options(old_options)
untrace(div)
#ASSIGNMENT FIVE
Make functions that calculate summary statistics and apply them to a variable to show that they work,
and then make a function that calculates a two-sample t-test and apply it to two samples. ##STEP ONE: Function to Calculate Summary Statistics We create a function that computes:
.Mean .Median .Minimum .Maximum .Standard Deviation .1st Quartile (Q1) .3rd Quartile (Q3)
# create a simple data set
marks <- c(65, 75, 99, 80, 97, 64, 79, 98)
#Create Function for Summary Statistics
# Function to Calculate Summary Statistics
# =========================================
# Prints the mean, quartiles, median, minimum, maximum and standard deviation
# of a numeric vector and returns the same values invisibly. Every statistic
# is computed by hand rather than with mean()/median()/sd().
#
# Arguments:
#   x      numeric vector with at least one non-missing value
#   na.rm  if TRUE, missing values are dropped before computing; if FALSE
#          (the default) missing values are an error
# Returns:
#   invisibly, a named numeric vector with elements
#   mean, q1, median, q3, min, max, sd
summary_statistics <- function(x, na.rm = FALSE) {
  if (!is.numeric(x)) {
    stop("`x` must be a numeric vector")
  }
  # sort() silently drops NA while length() counts it, so missing values must
  # be dealt with before any position in the sorted data is computed.
  if (anyNA(x)) {
    if (!na.rm) {
      stop("`x` contains missing values; use na.rm = TRUE to drop them")
    }
    x <- x[!is.na(x)]
  }
  # Number of observations
  n <- length(x)
  if (n == 0L) {
    stop("`x` must contain at least one non-missing value")
  }
  # Sort data
  sorted_x <- sort(x)
  # Mean
  mean_value <- sum(x) / n
  # Median: average of the two middle values when n is even
  if (n %% 2 == 0) {
    median_value <- (sorted_x[n / 2] + sorted_x[(n / 2) + 1]) / 2
  } else {
    median_value <- sorted_x[(n + 1) / 2]
  }
  # Quartiles: the order statistic nearest to position (n + 1) * p. The index
  # is clamped to 1..n because for very small n the rounded position can fall
  # outside the data (n = 1 gives 0 for Q1 and 2 for Q3).
  # This rule is simpler than the interpolation used by quantile().
  nearest_rank <- function(p) {
    min(max(round((n + 1) * p), 1), n)
  }
  # First Quartile (Q1)
  Q1 <- sorted_x[nearest_rank(0.25)]
  # Third Quartile (Q3)
  Q3 <- sorted_x[nearest_rank(0.75)]
  # Minimum and Maximum
  min_value <- sorted_x[1]
  max_value <- sorted_x[n]
  # Sample variance (n - 1 denominator); undefined for a single observation
  if (n > 1) {
    variance_value <- sum((x - mean_value)^2) / (n - 1)
  } else {
    variance_value <- NA_real_
  }
  # Standard Deviation
  sd_value <- sqrt(variance_value)
  # Display Results
  cat("SUMMARY STATISTICS\n")
  cat("----------------------\n")
  cat("Mean :", mean_value, "\n")
  cat("1st Quartile (Q1) :", Q1, "\n")
  cat("Median :", median_value, "\n")
  cat("3rd Quartile (Q3) :", Q3, "\n")
  cat("Minimum :", min_value, "\n")
  cat("Maximum :", max_value, "\n")
  cat("Standard Deviation :", sd_value, "\n")
  # Hand the numbers back so callers can reuse them instead of re-reading the
  # printed text.
  invisible(c(
    mean = mean_value,
    q1 = Q1,
    median = median_value,
    q3 = Q3,
    min = min_value,
    max = max_value,
    sd = sd_value
  ))
}
summary_statistics(marks)
## SUMMARY STATISTICS
## ----------------------
## Mean : 82.125
## 1st Quartile (Q1) : 65
## Median : 79.5
## 3rd Quartile (Q3) : 98
## Minimum : 64
## Maximum : 99
## Standard Deviation : 14.367
##Create Function for Two Sample t-Test
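The two-sample t-test statistic is calculated with the formula:
\[ t = \frac{\bar{x}_1 - \bar{x}_2} {\sqrt{\frac{s_1^2}{n_1} + \frac{s_2^2}{n_2}}} \]
where \(\bar{x}_i\), \(s_i^2\) and \(n_i\) are the sample mean, sample variance and sample size of group \(i\).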
#Create Two Sample Groups
# Group A data
data4 <- c(79, 74, 95, 82, 92)
# Group B data
data5 <- c(60, 70, 72, 78, 74)
#A t-test is a statistical test used to compare averages (means) and determine whether the difference between them is statistically significant.
# Function for Two Sample t-Test
# =========================================
# Welch two-sample t-test computed by hand: each group keeps its own variance
# in the standard error, so the degrees of freedom come from the
# Welch-Satterthwaite approximation rather than n1 + n2 - 2 (which belongs to
# the pooled-variance test).
#
# Arguments:
#   x, y  numeric vectors without missing values, each with at least two
#         observations
# Returns:
#   invisibly, a list with elements mean_x, mean_y, var_x, var_y, se, t_value,
#   df and p_value (two-sided)
# Note: the printed labels "Data 4" / "Data 5" are fixed text and refer to the
# first and second argument respectively.
two_sample_ttest <- function(x, y) {
  stopifnot(is.numeric(x), is.numeric(y), !anyNA(x), !anyNA(y))
  # Sample sizes
  n1 <- length(x)
  n2 <- length(y)
  if (n1 < 2 || n2 < 2) {
    stop("each sample needs at least two observations")
  }
  # Means
  mean1 <- sum(x) / n1
  mean2 <- sum(y) / n2
  # Variances (n - 1 denominator)
  var1 <- sum((x - mean1)^2) / (n1 - 1)
  var2 <- sum((y - mean2)^2) / (n2 - 1)
  # Squared standard error contributed by each group
  se2_1 <- var1 / n1
  se2_2 <- var2 / n2
  # Standard Error
  SE <- sqrt(se2_1 + se2_2)
  if (SE == 0) {
    stop("both samples are constant; the t-statistic is undefined")
  }
  # t-statistic
  t_value <- (mean1 - mean2) / SE
  # Degrees of freedom (Welch-Satterthwaite)
  df <- (se2_1 + se2_2)^2 / (se2_1^2 / (n1 - 1) + se2_2^2 / (n2 - 1))
  # Two-sided p-value from the t distribution with df degrees of freedom
  p_value <- 2 * stats::pt(-abs(t_value), df)
  # Display results
  cat("TWO SAMPLE t-TEST\n")
  cat("----------------------\n")
  cat("Mean of Data 4 :", mean1, "\n")
  cat("Mean of Data 5 :", mean2, "\n")
  cat("Variance of Data 4 :", var1, "\n")
  cat("Variance of Data 5 :", var2, "\n")
  cat("Standard Error :", SE, "\n")
  cat("t-statistic :", t_value, "\n")
  cat("Degrees of Freedom :", df, "\n")
  invisible(list(
    mean_x = mean1,
    mean_y = mean2,
    var_x = var1,
    var_y = var2,
    se = SE,
    t_value = t_value,
    df = df,
    p_value = p_value
  ))
}
two_sample_ttest(data4, data5)
## TWO SAMPLE t-TEST
## ----------------------
## Mean of Data 4 : 84.4
## Mean of Data 5 : 70.8
## Variance of Data 4 : 78.3
## Variance of Data 5 : 45.2
## Standard Error : 4.969909
## t-statistic : 2.736468
## Degrees of Freedom : 7.463852
#ASSIGNMENT SIX
##Apply Family Functions in R
These functions help apply operations to data efficiently.
sapply() and vapply() are variants of lapply() that produce vectors, matrices, and arrays as output instead of lists; mapply() iterates over multiple input data structures in parallel; and map() from purrr is the tidyverse counterpart of lapply().
1. lapply() 2. sapply() 3. vapply() 4. mapply() 5. map() from purrr
###lapply
# lapply() calls a function on every element of a list and hands back a LIST.
# Example data frame used by every apply-family call below
data1 <- data.frame(
  ID = c(1, 2, 3, 4),
  Name = c("John", "Jackson", "Paul", "Fabrice"),
  Year = rep(c(2023, 2024), each = 2),
  Age = c(20, 22, 35, 23),
  Score = c(80, 90, 75, 88)
)
# Show the data
data1
## ID Name Year Age Score
## 1 1 John 2023 20 80
## 2 2 Jackson 2023 22 90
## 3 3 Paul 2024 35 75
## 4 4 Fabrice 2024 23 88
# Keep only the two numeric columns of interest (a data frame is a list of
# columns, so the apply functions below visit one column at a time)
selected_data <- data1[c("Age", "Score")]
# Show the selection
selected_data
## Age Score
## 1 20 80
## 2 22 90
## 3 35 75
## 4 23 88
# Column means with lapply(): the result is a named list
result_lapply <- lapply(X = selected_data, FUN = mean)
# Show the result
result_lapply
## $Age
## [1] 25
##
## $Score
## [1] 83.25
##sapply()
# Column means with sapply(): the list is simplified to a named vector
result_sapply <- sapply(X = selected_data, FUN = mean)
# Show the result
result_sapply
## Age Score
## 25.00 83.25
##vapply()
# Column means with vapply(): like sapply(), but the shape of each result is
# declared up front (one number per column), which makes it the safer choice
result_vapply <- vapply(X = selected_data, FUN = mean, FUN.VALUE = numeric(1))
# Show the result
result_vapply
## Age Score
## 25.00 83.25
##mapply()
# mapply() walks several vectors in parallel; here each student's Age and
# Score are added together
result_mapply <- mapply(
  FUN = function(age, score) age + score,
  data1$Age,
  data1$Score
)
# Show the result
result_mapply
## [1] 100 112 110 111
##map() from purrr
# Install purrr only when it is missing, with an explicit CRAN mirror; a bare
# install.packages() call fails in non-interactive runs with
# "trying to use CRAN without setting a mirror".
if (!requireNamespace("purrr", quietly = TRUE)) {
  install.packages("purrr", repos = "https://cloud.r-project.org")
}
# Load package
library(purrr)
# Calculate the mean of every column; map_dbl() returns a named numeric vector
result_map <- map_dbl(
  selected_data,
  mean
)
# Display result
result_map
## Age Score
## 25.00 83.25