#ASSIGNMENT ONE

###Import Data from statistical packages (SPSS,STATA AND SAS)

# Install haven only when it is missing, and name the CRAN mirror explicitly:
# a bare install.packages() call aborts in non-interactive runs with
# "trying to use CRAN without setting a mirror" and would otherwise reinstall
# the package every time the script is executed.
if (!requireNamespace("haven", quietly = TRUE)) {
  install.packages("haven", repos = "https://cloud.r-project.org")
}
library(haven)
#data1 <- read_spss("C:/Users/HP/Desktop/data_spss.sav")

#data_stata <- read_stata("C:/Users/HP/Desktop/data_stata.dta")

#Import data from Excel and CSV 

# Install readxl only when it is missing, with an explicit CRAN mirror so the
# call also works when the script is run non-interactively (e.g. when knitting).
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl", repos = "https://cloud.r-project.org")
}
library(readxl)

# CSV files are read with base R's read.csv(); head() previews the first rows.
data2 <- read.csv("C:/Users/HP/Desktop/My_Masters/HW1_Fish/Fish.csv")
head(data2)
##   Species Weight Length1 Length2 Length3  Height  Width
## 1   Bream    242    23.2    25.4    30.0 11.5200 4.0200
## 2   Bream    290    24.0    26.3    31.2 12.4800 4.3056
## 3   Bream    340    23.9    26.5    31.1 12.3778 4.6961
## 4   Bream    363    26.3    29.0    33.5 12.7300 4.4555
## 5   Bream    430    26.5    29.0    34.0 12.4440 5.1340
## 6   Bream    450    26.8    29.7    34.7 13.6024 4.9274
#data3 <- read_excel("C:/Users/HP/Desktop/FEB INTAKE.xlsx")

#data_sas <- read_sas("C:/Users/HP/Desktop/crimestats.sas7bdat")
#database management systems

#chooseCRANmirror()
# Install the database packages only when they are missing, with an explicit
# CRAN mirror so the calls also work in non-interactive runs.
for (pkg in c("odbc", "DBI")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = "https://cloud.r-project.org")
  }
}
library(DBI)
library(odbc)

# Open a single connection to the local SQL Server Express instance using
# Windows authentication. Opening a second connection into the same variable
# would orphan the first one without closing it.
con <- dbConnect(
  odbc(),
  Driver = "SQL Server",
  Server = "localhost\\SQLEXPRESS",
  Database = "GradeDB",
  Trusted_Connection = "True"
)

# Alternative: the newer Microsoft driver. Use it INSTEAD of the call above,
# not in addition to it.
# con <- dbConnect(
#   odbc(),
#   Driver = "ODBC Driver 17 for SQL Server",
#   Server = "localhost\\SQLEXPRESS",
#   Database = "GradeDB",
#   Trusted_Connection = "Yes"
# )

# Display the first 10 tables only
head(dbListTables(con), 10)
##  [1] "21stGraduation"       "23953"                "A1_2017"             
##  [4] "A12018All"            "A1All"                "A1All2"              
##  [7] "A1andProvA0_May2015B" "A1andProvA0_May2015C" "A1andProvAO_May2015" 
## [10] "A1ApplicationMay2019"
# Read the whole "v_course" table/view into a data frame and preview it
students_data <- dbReadTable(con, "v_course")
head(students_data)
##   COURSECODE                                CourseName
## 1          1                          Zero Credit    1
## 2  ACCT  229                    Fiscalité    ACCT  229
## 3  ACCT 1100 Initiation à La Comptabilité    ACCT 1100
## 4  ACCT 1101        Comptabilité Générale    ACCT 1101
## 5  ACCT 1102 Initiation à La Comptabilité    ACCT 1102
## 6  ACCT 1103  Comptabilité Internationale    ACCT 1103

# Release the database connection once the data has been read
dbDisconnect(con)

#ASSIGNMENT TWO

###MERGING DATASET WITH 2 OR 3 VARIABLES In R, merging means combining two data frames based on one or more common variables (columns).

# Table 1: one score per student (IDs 1 to 4)
data1 <- data.frame(
  ID    = c(1, 2, 3, 4),
  Name  = c("John", "jackson", "Paul", "Fabrice"),
  Year  = c(2023, 2023, 2024, 2024),
  Score = c(80, 90, 75, 88)
)

# Table 2: one grade per student (IDs 1, 2, 3 and 5). The Name and Year
# columns hold the same values as in data1, so they are taken from there.
data2 <- data.frame(
  ID = c(1, 2, 3, 5),
  data1[c("Name", "Year")],
  Grade = c("A", "A", "B", "C")
)
# Preview both tables before merging
head(data1)
##   ID    Name Year Score
## 1  1    John 2023    80
## 2  2 jackson 2023    90
## 3  3    Paul 2024    75
## 4  4 Fabrice 2024    88
head(data2)
##   ID    Name Year Grade
## 1  1    John 2023     A
## 2  2 jackson 2023     A
## 3  3    Paul 2024     B
## 4  5 Fabrice 2024     C
# Merge using TWO key variables (ID and Name).
# Only rows whose keys appear in both tables are kept, so IDs 4 and 5 drop out.
# Year exists in both tables but is not a key, so it comes back twice as
# Year.x (from data1) and Year.y (from data2).
merge(data1, data2, by = c("ID", "Name"))
##   ID    Name Year.x Score Year.y Grade
## 1  1    John   2023    80   2023     A
## 2  2 jackson   2023    90   2023     A
## 3  3    Paul   2024    75   2024     B
# Merge using THREE key variables (ID, Name and Year).
# Year is now part of the key, so it appears only once in the result.
merge(data1, data2, by = c("ID", "Name", "Year"))
##   ID    Name Year Score Grade
## 1  1    John 2023    80     A
## 2  2 jackson 2023    90     A
## 3  3    Paul 2024    75     B
# merging using left and right join
# Install dplyr only when it is missing, with an explicit CRAN mirror so the
# call also works in non-interactive runs.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr", repos = "https://cloud.r-project.org")
}
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# left_join(): keep every row of data1; columns coming from data2 are NA where
# data2 has no matching ID (here ID 4).
left_join(data1, data2, by = "ID")
##   ID  Name.x Year.x Score  Name.y Year.y Grade
## 1  1    John   2023    80    John   2023     A
## 2  2 jackson   2023    90 jackson   2023     A
## 3  3    Paul   2024    75    Paul   2024     B
## 4  4 Fabrice   2024    88    <NA>     NA  <NA>
# right_join(): keep every row of data2; columns coming from data1 are NA where
# data1 has no matching ID (here ID 5).
right_join(data1, data2, by = "ID")
##   ID  Name.x Year.x Score  Name.y Year.y Grade
## 1  1    John   2023    80    John   2023     A
## 2  2 jackson   2023    90 jackson   2023     A
## 3  3    Paul   2024    75    Paul   2024     B
## 4  5    <NA>     NA    NA Fabrice   2024     C

#ASSIGNMENT THREE Give examples using these functions: select(), filter(), arrange(), rename(), mutate(), group_by(), and %>%

# lets use Select()
#1. When you only need certain columns

#If your dataset has many variables but you only want a few:
library(dplyr)

# Small example table used by all of the dplyr verb demonstrations
students <- data.frame(
  ID    = c(40, 20, 30, 10),
  Names = c("Jackson", "Fabrice", "Giddy", "Paul"),
  Age   = c(30, 28, 29, 40),
  Marks = c(60, 48, 50, 49)
)

select(students, ID:Age) # select(): keep a range of columns, from ID through Age
##   ID   Names Age
## 1 40 Jackson  30
## 2 20 Fabrice  28
## 3 30   Giddy  29
## 4 10    Paul  40
# Other select() variants (commented out; uncomment to try them):
#select(students, -Age) #Exclude columns
#select(students, contains("a")) #Select columns containing text
#select(students, starts_with("M")) #Select columns starts with
#select(students,Student_Names = Names) #Rename while selecting
# filter()
# filter() selects rows that satisfy a condition; here, students with Marks below 50.
filter(students, Marks < 50 )
##   ID   Names Age Marks
## 1 20 Fabrice  28    48
## 2 10    Paul  40    49
#filter(students, Age == 20 & Marks > 80) #Multiple conditions
# arrange()

# arrange() sorts rows. The default order is ascending; desc() reverses it,
# so the highest Marks come first.
arrange(students, desc(Marks))
##   ID   Names Age Marks
## 1 40 Jackson  30    60
## 2 30   Giddy  29    50
## 3 10    Paul  40    49
## 4 20 Fabrice  28    48
# mutate()
# mutate() creates new columns or changes existing columns in a data frame.
# In simple words: add or modify variables (columns).

# Here it adds a new column, Bonus, equal to Marks plus 5.
mutate(students, Bonus = Marks + 5)
##   ID   Names Age Marks Bonus
## 1 40 Jackson  30    60    65
## 2 20 Fabrice  28    48    53
## 3 30   Giddy  29    50    55
## 4 10    Paul  40    49    54
# group_by() and %>%
# group_by() organizes rows into groups and %>% passes the data step by step
# through the operations. Every Age value occurs once in `students`, so each
# group total below is just that one student's Marks.
  
students %>%
  group_by(Age) %>%
  summarise(total_marks = sum(Marks))
## # A tibble: 4 × 2
##     Age total_marks
##   <dbl>       <dbl>
## 1    28          48
## 2    29          50
## 3    30          60
## 4    40          49

#ASSIGNMENT FOUR Use online resources to show how these functions work: 1. trace(), 2. recover()

  1. trace(): the trace() function is used to temporarily insert code into a function so you can debug it or understand what it is doing internally. It helps you see what happens inside a function.

  2. recover(): used to debug errors interactively when a function crashes. When an error happens, R will show:

Enter a frame number, or 0 to exit

#trace()
#trace() = “peek inside a function and modify or observe how it runs”

# Divide x by y and report the sign of the quotient.
#
# Arguments:
#   x  numerator
#   y  denominator; a value equal to 0 raises an error
#
# Prints "Answer is positive" or "Answer is negative" followed by the quotient
# on the next line. The function is called for this printed output only.
div <- function(x, y) {
  # Guard clause: refuse to divide by zero
  if (y == 0) {
    stop("Cannot divide by zero")
  }

  quotient <- x / y

  # Zero counts as positive, exactly as the >= comparison implies
  label <- if (quotient >= 0) "Answer is positive\n" else "Answer is negative\n"
  cat(label, quotient)
}

# Attach a tracer to div(): the quoted expression is run on entry to every
# call and prints the arguments, without editing div() itself.
trace(
  div,
  tracer = quote(cat("Tracing: x=", x, "y= ", y))
)
## [1] "div"
#3 — Use recover()

# Make R call recover() whenever an error occurs. options() returns the
# previous settings, so they are kept in order to restore them afterwards.
old_options <- options(error = recover)
div(4, 0)
## Tracing div(4, 0) on entry 
## Tracing: x= 4 y=  0
## Error in div(4, 0): Cannot divide by zero

# Undo the debugging hooks so the rest of the script is unaffected: remove
# the tracer from div() and restore the previous error handler. Without this,
# every later error would also drop into the recover() frame browser.
untrace(div)
options(old_options)

#ASSIGNMENT FIVE

Make a function that calculates summary statistics and apply it to a variable to show that it works,

and then make a function that calculates a two-sample t-test and apply it to sample data. ##STEP ONE: Function to Calculate Summary Statistics. We create a function that computes:

.Mean .Median .Minimum .Maximum .Standard Deviation .1st Quartile (Q1) .3rd Quartile (Q3)

# create a simple data set
marks <- c(65, 75, 99, 80, 97, 64, 79, 98)

# Function to Calculate Summary Statistics
# =========================================
#
# Computes the statistics by hand (no mean(), median(), sd() or quantile()),
# prints them, and also returns them so they can be reused.
#
# Arguments:
#   x      numeric vector with at least one value
#   na.rm  if TRUE, missing values are dropped before computing; if FALSE
#          (the default), missing values cause an error, because they would
#          otherwise make every statistic meaningless
#
# Returns (invisibly) a named numeric vector with the elements
#   mean, q1, median, q3, min, max, sd
#
# Quartiles use the nearest-rank rule: the sorted value at position
# round((n + 1) * p). This is a simple approximation and can differ from the
# interpolated values returned by quantile().
summary_statistics <- function(x, na.rm = FALSE) {
  if (!is.numeric(x)) {
    stop("`x` must be a numeric vector", call. = FALSE)
  }
  if (anyNA(x)) {
    if (!na.rm) {
      stop("`x` contains missing values; use na.rm = TRUE to drop them",
           call. = FALSE)
    }
    x <- x[!is.na(x)]
  }

  # Number of observations (counted after any missing values are removed, so
  # it always agrees with the length of the sorted data)
  n <- length(x)
  if (n == 0) {
    stop("`x` must contain at least one value", call. = FALSE)
  }

  # Sort data
  sorted_x <- sort(x)

  # Mean
  mean_value <- sum(x) / n

  # Median: average of the two middle values for even n, middle value otherwise
  if (n %% 2 == 0) {
    median_value <- (sorted_x[n / 2] + sorted_x[n / 2 + 1]) / 2
  } else {
    median_value <- sorted_x[(n + 1) / 2]
  }

  # Nearest-rank quartile. The position is clamped to 1..n because for very
  # small samples round((n + 1) * p) can be 0 or n + 1, which is not a valid
  # index (for n = 1 it used to give an empty Q1 and an NA Q3).
  quartile_at <- function(p) {
    position <- round((n + 1) * p)
    sorted_x[min(max(position, 1), n)]
  }

  # First Quartile (Q1) and Third Quartile (Q3)
  Q1 <- quartile_at(0.25)
  Q3 <- quartile_at(0.75)

  # Minimum and Maximum
  min_value <- sorted_x[1]
  max_value <- sorted_x[n]

  # Sample standard deviation (n - 1 denominator); undefined for one value
  if (n > 1) {
    variance_value <- sum((x - mean_value)^2) / (n - 1)
    sd_value <- sqrt(variance_value)
  } else {
    sd_value <- NA_real_
  }

  # Display Results
  cat("SUMMARY STATISTICS\n")
  cat("----------------------\n")
  cat("Mean :", mean_value, "\n")
  cat("1st Quartile (Q1) :", Q1, "\n")
  cat("Median :", median_value, "\n")
  cat("3rd Quartile (Q3) :", Q3, "\n")
  cat("Minimum :", min_value, "\n")
  cat("Maximum :", max_value, "\n")
  cat("Standard Deviation :", sd_value, "\n")

  # Return the values invisibly so callers can reuse them without extra output
  invisible(c(
    mean = mean_value,
    q1 = Q1,
    median = median_value,
    q3 = Q3,
    min = min_value,
    max = max_value,
    sd = sd_value
  ))
}
summary_statistics(marks)
## SUMMARY STATISTICS
## ----------------------
## Mean : 82.125 
## 1st Quartile (Q1) : 65 
## Median : 79.5 
## 3rd Quartile (Q3) : 98 
## Minimum : 64 
## Maximum : 99 
## Standard Deviation : 14.367

##Create Function for Two Sample t-Test

The two-sample t-test statistic is calculated with the formula:

\[ t = \frac{\bar{x}_1 - \bar{x}_2} {\sqrt{\frac{s_1^2}{n_1} + \frac{s_2^2}{n_2}}} \]

# Create Two Sample Groups
# Group A data
data4 <- c(79, 74, 95, 82, 92)

# Group B data
data5 <- c(60, 70, 72, 78, 74)

# A t-test is a statistical test used to compare averages (means) and to
# determine whether the difference between them is statistically significant.

# Function for Two Sample t-Test
# =========================================
#
# Welch's two-sample t-test computed by hand. It implements the formula
#   t = (mean1 - mean2) / sqrt(var1 / n1 + var2 / n2)
# which does NOT assume equal variances. The matching degrees of freedom are
# therefore the Welch-Satterthwaite approximation, not n1 + n2 - 2 (that value
# belongs to the pooled-variance test and does not go with this standard error).
#
# Arguments:
#   x, y  numeric vectors without missing values, each with at least two values
#
# Prints the means, variances, standard error, t-statistic and degrees of
# freedom, and returns (invisibly) a list with the elements
#   mean, variance  (named vectors with entries x and y)
#   std_error, statistic, df
#   p_value         (two-sided p-value from the t distribution; not printed)
two_sample_ttest <- function(x, y) {
  if (!is.numeric(x) || !is.numeric(y)) {
    stop("`x` and `y` must be numeric vectors", call. = FALSE)
  }
  if (anyNA(x) || anyNA(y)) {
    stop("`x` and `y` must not contain missing values", call. = FALSE)
  }

  # Sample sizes
  n1 <- length(x)
  n2 <- length(y)
  if (n1 < 2 || n2 < 2) {
    stop("each sample needs at least two observations", call. = FALSE)
  }

  # Means
  mean1 <- sum(x) / n1
  mean2 <- sum(y) / n2

  # Sample variances (n - 1 denominator)
  var1 <- sum((x - mean1)^2) / (n1 - 1)
  var2 <- sum((y - mean2)^2) / (n2 - 1)

  # Squared standard error of each mean, then the combined Standard Error
  se_sq1 <- var1 / n1
  se_sq2 <- var2 / n2
  SE <- sqrt(se_sq1 + se_sq2)

  # With (numerically) constant samples the statistic would be a division by 0
  if (SE < 10 * .Machine$double.eps * max(abs(mean1), abs(mean2))) {
    stop("data are essentially constant; the t-statistic is undefined",
         call. = FALSE)
  }

  # t-statistic
  t_value <- (mean1 - mean2) / SE

  # Degrees of freedom (Welch-Satterthwaite approximation)
  df <- (se_sq1 + se_sq2)^2 /
    (se_sq1^2 / (n1 - 1) + se_sq2^2 / (n2 - 1))

  # Two-sided p-value
  p_value <- 2 * stats::pt(-abs(t_value), df)

  # Display results (generic labels: the function works for any two samples)
  cat("TWO SAMPLE t-TEST\n")
  cat("----------------------\n")

  cat("Mean of Sample 1 :", mean1, "\n")
  cat("Mean of Sample 2 :", mean2, "\n")

  cat("Variance of Sample 1 :", var1, "\n")
  cat("Variance of Sample 2 :", var2, "\n")

  cat("Standard Error :", SE, "\n")

  cat("t-statistic :", t_value, "\n")

  cat("Degrees of Freedom :", df, "\n")

  # Return the values invisibly so callers can reuse them without extra output
  invisible(list(
    mean = c(x = mean1, y = mean2),
    variance = c(x = var1, y = var2),
    std_error = SE,
    statistic = t_value,
    df = df,
    p_value = p_value
  ))
}

two_sample_ttest(data4, data5)
## TWO SAMPLE t-TEST
## ----------------------
## Mean of Sample 1 : 84.4 
## Mean of Sample 2 : 70.8 
## Variance of Sample 1 : 78.3 
## Variance of Sample 2 : 45.2 
## Standard Error : 4.969909 
## t-statistic : 2.736468 
## Degrees of Freedom : 7.463852

#ASSIGNMENT SIX

##Apply Family Functions in R

These functions help apply operations to data efficiently.

sapply() and vapply() are variants of lapply() that produce vectors, matrices, and arrays as output instead of lists; map() and mapply() iterate over multiple input data structures in parallel.

1.lapply() 2.sapply() 3.vapply() 4.mapply() 5.map() from purrr

###lapply

#lapply() applies a function to each element of a list and returns a LIST.

# Example table reused by every apply-family demonstration below
data1 <- data.frame(
  ID    = c(1, 2, 3, 4),
  Name  = c("John", "Jackson", "Paul", "Fabrice"),
  Year  = c(2023, 2023, 2024, 2024),
  Age   = c(20, 22, 35, 23),
  Score = c(80, 90, 75, 88)
)

# Print the table
data1
##   ID    Name Year Age Score
## 1  1    John 2023  20    80
## 2  2 Jackson 2023  22    90
## 3  3    Paul 2024  35    75
## 4  4 Fabrice 2024  23    88
# lapply() calls a function on each element of a list and returns a LIST.
# A data frame is a list of columns, so the function is applied per column.
# Keep only the two numeric columns of interest: Age and Score
selected_data <- data1[c("Age", "Score")]

# Print the reduced table
selected_data
##   Age Score
## 1  20    80
## 2  22    90
## 3  35    75
## 4  23    88
# Column means with lapply(): one list element per column
result_lapply <- lapply(selected_data, mean)

# Print the list of means
result_lapply
## $Age
## [1] 25
## 
## $Score
## [1] 83.25

##sapply()

# Column means with sapply()

# Same computation, but the result is simplified to a named vector/matrix.
result_sapply <- sapply(selected_data, mean)

# Print the named vector of means
result_sapply
##   Age Score 
## 25.00 83.25

##vapply()

# Safer version of sapply(): the expected type and length of each result
# (here a single number) is declared up front.
# Column means with vapply()
result_vapply <- vapply(selected_data, mean, FUN.VALUE = numeric(1))

# Print the named vector of means
result_vapply
##   Age Score 
## 25.00 83.25

##mapply()

# Applies a function to several vectors in parallel, element by element.
# Here: Age + Score for each row
result_mapply <- mapply(
  function(age, score) age + score,
  data1$Age,
  data1$Score
)

# Print the row-wise sums
result_mapply
## [1] 100 112 110 111

##map() from purrr

# Install purrr only when it is missing, with an explicit CRAN mirror so the
# call also works in non-interactive runs.
if (!requireNamespace("purrr", quietly = TRUE)) {
  install.packages("purrr", repos = "https://cloud.r-project.org")
}
# Load package
library(purrr)
# Column means with map_dbl(): like lapply(), but the result is returned as a
# numeric (double) vector, one value per column
result_map <- map_dbl(
  selected_data,
  mean
)

# Display result
result_map
##   Age Score 
## 25.00 83.25