#ASSIGNMENT ONE

###Import Data from statistical packages (SPSS,STATA AND SAS)

# haven reads SPSS (.sav), Stata (.dta) and SAS (.sas7bdat) files.
# Install it only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("haven", quietly = TRUE)) {
  install.packages("haven")
}
library(haven)

# Use forward slashes in Windows paths: inside an R string a backslash starts
# an escape sequence and "\R" is not a valid one, so the original path made
# the script fail with "'\R' is an unrecognized escape in character string".
data1 <- read_spss("D:/R Programming/data_spss.sav")

#data_stata <- read_stata("D:/R Programming/data_stata.dta")

#Import data from Excel and CSV

# readxl provides read_excel() for .xls/.xlsx files.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
library(readxl)

# NOTE(review): this folder is "D:/Programming", while the SPSS file above is
# read from "D:/R Programming" - confirm which location is intended.
data2 <- read.csv("D:/Programming/CO2_emission.csv")
head(data2)
#data3 <- read_excel("C:/Users/HP/Desktop/AllStaff.xlsx")

#data_sas <- read_sas("C:/Users/HP/Desktop/cinemaal.sas7bdat")
#database management systems

# Pick a CRAN mirror explicitly. chooseCRANmirror() opens an interactive menu
# and fails when the script runs non-interactively (Rscript, knitr), which in
# turn made every install.packages() call and everything after it fail.
options(repos = c(CRAN = "https://cloud.r-project.org"))

# Install the database packages only when they are missing, then load them.
for (pkg in c("DBI", "odbc")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(DBI)
library(odbc)

# Open a single connection to the local SQL Server Express instance using
# Windows authentication. The script previously connected twice and
# overwrote `con`, which would have left the first connection open.
# NOTE(review): the driver name must match an entry reported by
# odbc::odbcListDrivers(); the earlier attempt used "ODBC Driver 17", whose
# registered name is usually "ODBC Driver 17 for SQL Server" - confirm.
con <- dbConnect(
  odbc(),
  Driver = "SQL Server",
  Server = "localhost\\SQLEXPRESS",
  Database = "FlightDB",
  Trusted_Connection = "True"
)

# List the tables available in FlightDB.
dbListTables(con)

# Read the whole "departuretime" table into a data frame and preview it.
passenger_data <- dbReadTable(con, "departuretime")
head(passenger_data)

# Release the connection once the data has been read.
dbDisconnect(con)

#ASSIGNMENT TWO

###MERGING DATASETS WITH 2 OR 3 VARIABLES
# In R, merging means combining two data frames based on one or more common variables (columns).

# First dataset
student1 <- data.frame(
  ID    = c(1, 2, 3, 4),
  Name  = c("Baho", "Zayn", "Nkusi", "Gentille"),
  Year  = rep(c(2023, 2024), each = 2),
  Score = c(80, 90, 75, 88)
)

# Second dataset: same key columns (ID, Name, Year) plus a letter grade.
# Only the first row ("Baho", ID 1, 2023) agrees with student1 on every key.
student2 <- data.frame(
  ID    = c(1, 2, 3, 5),
  Name  = c("Baho", "Umurerwa", "Saka", "Gykores"),
  Year  = rep(c(2023, 2024), each = 2),
  Grade = c("A", "A", "B", "C")
)
head(x = student1)
##   ID     Name Year Score
## 1  1     Baho 2023    80
## 2  2     Zayn 2023    90
## 3  3    Nkusi 2024    75
## 4  4 Gentille 2024    88
head(x = student2)
##   ID     Name Year Grade
## 1  1     Baho 2023     A
## 2  2 Umurerwa 2023     A
## 3  3     Saka 2024     B
## 4  5  Gykores 2024     C
# Merge on TWO key columns. Year is not a key here, so both copies are kept
# and disambiguated with the .x / .y suffixes.
merge(x = student1, y = student2, by = c("ID", "Name"))
##   ID Name Year.x Score Year.y Grade
## 1  1 Baho   2023    80   2023     A
# Merge on THREE key columns. Year is now a key, so it appears only once.
merge(x = student1, y = student2, by = c("ID", "Name", "Year"))
##   ID Name Year Score Grade
## 1  1 Baho 2023    80     A

#ASSIGNMENT THREE Give examples by using this function

# lets use Select()
#1. When you only need certain columns

#If your dataset has many variables but you only want a few:
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Small example table used by every dplyr verb below.
students <- data.frame(
  ID = c(40, 20, 30, 10),
  Names = c("Baho", "Zayn", "Nkusi", "Gentille"),
  Age = c(30, 28, 29, 40),
  Marks = c(60, 48, 50, 49)
)
head(students)
##   ID    Names Age Marks
## 1 40     Baho  30    60
## 2 20     Zayn  28    48
## 3 30    Nkusi  29    50
## 4 10 Gentille  40    49
select(students, ID:Age) #Select a range of columns
##   ID    Names Age
## 1 40     Baho  30
## 2 20     Zayn  28
## 3 30    Nkusi  29
## 4 10 Gentille  40
# Exclude a column. The table has no `Score` column (the earlier call failed
# with "Column `Score` doesn't exist"), so drop `Marks` instead.
select(students, -Marks) #Exclude columns
##   ID    Names Age
## 1 40     Baho  30
## 2 20     Zayn  28
## 3 30    Nkusi  29
## 4 10 Gentille  40
# contains() matches against column NAMES (case-insensitively by default).
# No column name contains "B", so that pattern selected nothing; "a" matches
# Names, Age and Marks.
select(students, contains("a")) #Select columns containing text
##      Names Age Marks
## 1     Baho  30    60
## 2     Zayn  28    48
## 3    Nkusi  29    50
## 4 Gentille  40    49
select(students, starts_with("N")) #Select columns starts with
##      Names
## 1     Baho
## 2     Zayn
## 3    Nkusi
## 4 Gentille
#Filter()
#the filter() function is used to select rows based on conditions.
filter(students, Marks < 60)
##   ID    Names Age Marks
## 1 20     Zayn  28    48
## 2 30    Nkusi  29    50
## 3 10 Gentille  40    49
# Both conditions must hold; no student is 29 with more than 70 marks, so the
# result has zero rows.
filter(students, Age == 29 & Marks > 70)
## [1] ID    Names Age   Marks
## <0 rows> (or 0-length row.names)
# arrange()
# arrange() reorders rows; desc() sorts from highest to lowest.
arrange(students, desc(Marks))
##   ID    Names Age Marks
## 1 40     Baho  30    60
## 2 30    Nkusi  29    50
## 3 10 Gentille  40    49
## 4 20     Zayn  28    48
#mutate()
#mutate() is used to create new columns or change existing columns in a data frame.
# in simple words: add or modify variables (columns)

#Used to create a new column.
mutate(students, Bonus = Marks + 20)
##   ID    Names Age Marks Bonus
## 1 40     Baho  30    60    80
## 2 20     Zayn  28    48    68
## 3 30    Nkusi  29    50    70
## 4 10 Gentille  40    49    69
#group_by() and %>%

# Group the rows by Age, then total the marks within each group.
students %>%
  group_by(Age) %>%
  summarise(total_marks = sum(Marks))
## # A tibble: 4 × 2
##     Age total_marks
##   <dbl>       <dbl>
## 1    28          48
## 2    29          50
## 3    30          60
## 4    40          49

#ASSIGNMENT FOUR
# Use online resources to show how these work: 1. trace(), to peek inside a function and observe or modify how it works; 2. recover()

#trace()
# Multiply two numbers, report whether the product is large, and return it.
#
# Args:
#   a, b: numeric values to multiply.
# Returns:
#   The product a * b, invisibly, so the value can be assigned or piped
#   without being printed a second time.
# Errors:
#   Stops with "Both inputs must be numeric" if either input is not numeric.
mult <- function(a, b) {
  if (!is.numeric(a) || !is.numeric(b)) {
    stop("Both inputs must be numeric")
  }

  result <- a * b

  # A product above 100 is reported as "Large", anything else as "Small".
  label <- if (result > 100) "Large result:" else "Small result:"

  # Print label and value on one line and end it with a newline; previously
  # the newline sat between the label and the value and the line was never
  # terminated.
  cat(label, result, "\n")

  # Hand the product back to the caller; previously the function returned
  # NULL (the value of cat()).
  invisible(result)
}

# Add tracing
# Attach a tracer to mult(): the quoted expression is evaluated on entry to
# every call, so the arguments are printed before the body runs.
# trace() returns the name of the traced function.
trace(
  mult,
  tracer = quote(cat("Tracing: a =", a, "b =", b, "\n"))
)
## [1] "mult"
# Enable recover mode
# options() returns the previous values, which are kept so the original error
# handler can be restored after the demonstration.
old_options <- options(error = recover)

# Run function
# "5" is a character string, so mult() stops with an error; with recover
# installed as the handler, the error is then passed to recover().
mult("5", 10)
## Tracing mult("5", 10) on entry 
## Tracing: a = 5 b = 10
## Error in mult("5", 10): Both inputs must be numeric

# Clean up once the demonstration is finished: restore the previous error
# handler and remove the tracer, so later code is not affected by them.
options(old_options)
untrace(mult)

#ASSIGNMENT FIVE

# Make functions that calculate summary statistics and apply them to a variable to show that they work,
#
# and then make a function to calculate a two-sample t-test, then apply it to data.
## STEP ONE: Function to Calculate Summary Statistics
# We create a function that computes:
#
# Mean, Median, Minimum, Maximum, Standard Deviation, 1st Quartile (Q1), 3rd Quartile (Q3)

# create a simple data set
marks <- c(65, 35, 10, 20, 97, 64, 29, 38)
#Create Function for Summary Statistics

# Function to Calculate Summary Statistics
# =========================================
#
# Print the mean, quartiles, median, minimum, maximum and sample standard
# deviation of a numeric vector, and return them.
#
# Args:
#   x: non-empty numeric vector without missing values.
# Returns:
#   Invisibly, a named numeric vector with elements
#   mean, q1, median, q3, min, max and sd.
# Errors:
#   Stops if x is not numeric, is empty, or contains NA.
summary_statistics <- function(x) {
  if (!is.numeric(x) || length(x) == 0L) {
    stop("x must be a non-empty numeric vector")
  }
  # sort() silently drops NA while length() still counts it, which would make
  # every position-based statistic wrong, so missing values are rejected.
  if (anyNA(x)) {
    stop("x must not contain missing values")
  }

  # Quartiles at position (n + 1) * p, which is quantile() type 6. The value
  # is interpolated between neighbouring observations instead of rounding the
  # position, which could give index 0 or an index past the end for n < 3.
  quartiles <- quantile(x, probs = c(0.25, 0.75), type = 6, names = FALSE)

  stats <- c(
    mean = mean(x),
    q1 = quartiles[1],
    median = median(x),
    q3 = quartiles[2],
    min = min(x),
    max = max(x),
    sd = sd(x) # sample standard deviation (n - 1 denominator)
  )

  # Display Results
  cat("SUMMARY STATISTICS\n")
  cat("----------------------\n")
  cat("Mean :", stats[["mean"]], "\n")
  cat("1st Quartile (Q1) :", stats[["q1"]], "\n")
  cat("Median :", stats[["median"]], "\n")
  cat("3rd Quartile (Q3) :", stats[["q3"]], "\n")
  cat("Minimum :", stats[["min"]], "\n")
  cat("Maximum :", stats[["max"]], "\n")
  cat("Standard Deviation :", stats[["sd"]], "\n")

  # Return the values so callers can reuse them, not only read them on screen.
  invisible(stats)
}
summary_statistics(marks)
## SUMMARY STATISTICS
## ----------------------
## Mean : 44.75 
## 1st Quartile (Q1) : 22.25 
## Median : 36.5 
## 3rd Quartile (Q3) : 64.75 
## Minimum : 10 
## Maximum : 97 
## Standard Deviation : 28.58446

##Create Function for Two Sample t-Test

#Create Two Sample Groups
# Group A data
dataset1 <- c(29, 44, 25, 72, 82)

# Group B data
dataset2 <- c(80, 40, 52, 18, 64)

#A t-test is a statistical test used to compare averages (means) and determine whether the difference between them is statistically significant.

# Function for Two Sample t-Test
# =========================================
#
# Compare the means of two independent samples with a two-sided t-test,
# print the intermediate quantities, and return them.
#
# Args:
#   x, y:      numeric vectors with at least two observations each.
#   var_equal: FALSE (default) runs Welch's test: unpooled standard error
#              with Welch-Satterthwaite degrees of freedom. TRUE runs the
#              classic pooled-variance test with n1 + n2 - 2 degrees of
#              freedom.
# Returns:
#   Invisibly, a list with mean1, mean2, var1, var2, se, t_value, df and
#   p_value.
two_sample_ttest <- function(x, y, var_equal = FALSE) {
  stopifnot(
    is.numeric(x), is.numeric(y),
    length(x) >= 2L, length(y) >= 2L,
    !anyNA(x), !anyNA(y)
  )

  # Sample sizes
  n1 <- length(x)
  n2 <- length(y)

  # Means
  mean1 <- mean(x)
  mean2 <- mean(y)

  # Sample variances (n - 1 denominator)
  var1 <- var(x)
  var2 <- var(y)

  # The standard error and the degrees of freedom must come from the same
  # variant of the test. Combining the unpooled standard error with
  # n1 + n2 - 2 degrees of freedom is not a valid test.
  if (var_equal) {
    pooled_var <- ((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)
    SE <- sqrt(pooled_var * (1 / n1 + 1 / n2))
    df <- n1 + n2 - 2
  } else {
    se1_sq <- var1 / n1
    se2_sq <- var2 / n2
    SE <- sqrt(se1_sq + se2_sq)
    # Welch-Satterthwaite approximation
    df <- (se1_sq + se2_sq)^2 /
      (se1_sq^2 / (n1 - 1) + se2_sq^2 / (n2 - 1))
  }

  # t-statistic
  t_value <- (mean1 - mean2) / SE

  # Two-sided p-value from the t distribution
  p_value <- 2 * pt(-abs(t_value), df)

  # Display results
  cat("TWO SAMPLE t-TEST\n")
  cat("----------------------\n")

  cat("Mean of Dataset 1 :", mean1, "\n")
  cat("Mean of Dataset 2 :", mean2, "\n")

  cat("Variance of Dataset 1 :", var1, "\n")
  cat("Variance of Dataset 2 :", var2, "\n")

  cat("Standard Error :", SE, "\n")

  cat("t-statistic :", t_value, "\n")

  cat("Degrees of Freedom :", df, "\n")

  cat("p-value (two-sided) :", signif(p_value, 3), "\n")

  invisible(list(
    mean1 = mean1, mean2 = mean2,
    var1 = var1, var2 = var2,
    se = SE, t_value = t_value, df = df, p_value = p_value
  ))
}

# Compare Group A with Group B. The call previously passed dataset1 twice, so
# it compared the sample with itself and dataset2 was never used.
two_sample_ttest(dataset1, dataset2)
## TWO SAMPLE t-TEST
## ----------------------
## Mean of Dataset 1 : 50.4 
## Mean of Dataset 2 : 50.8 
## Variance of Dataset 1 : 652.3 
## Variance of Dataset 2 : 555.2 
## Standard Error : 15.54027 
## t-statistic : -0.02573958 
## Degrees of Freedom : 7.948601 
## p-value (two-sided) : 0.98

#ASSIGNMENT SIX

##Apply Family Functions in R

# These functions help apply operations to data efficiently.
#
# sapply() and vapply() are variants of lapply() that produce vectors, matrices, and arrays as output instead of lists; Map() and mapply() iterate over multiple input data structures in parallel.
#
# 1. lapply() 2. sapply() 3. vapply() 4. mapply() 5. map() from purrr
## Sample data

# Named list of three integer ranges, shared by the list-based examples.
numbers <- setNames(list(1:5, 6:10, 11:15), c("a", "b", "c"))

# 1. lapply() -> always returns a LIST, one element per input element
lapply(X = numbers, FUN = sum)
## $a
## [1] 15
## 
## $b
## [1] 40
## 
## $c
## [1] 65
# 2. sapply() -> like lapply(), but simplifies the result to a vector
sapply(X = numbers, FUN = sum)
##  a  b  c 
## 15 40 65
# 3. vapply() -> like sapply(), but the shape of each result is declared
#    up front (here: one number per element)
vapply(X = numbers, FUN = sum, FUN.VALUE = numeric(1))
##  a  b  c 
## 15 40 65
# 4. apply() -> works over the margins of matrices/arrays
mat <- matrix(seq_len(9), nrow = 3)
# MARGIN = 2: one sum per column
apply(X = mat, MARGIN = 2, FUN = sum)
## [1]  6 15 24
# MARGIN = 1: one sum per row
apply(X = mat, MARGIN = 1, FUN = sum)
## [1] 12 15 18
# 5. mapply() -> walks several inputs in parallel, simplifying the result
x <- seq_len(5)
y <- x + 5L

mapply(FUN = function(left, right) left + right, x, y)
## [1]  7  9 11 13 15
# 6. Map() -> same parallel walk as mapply(), but always returns a LIST
Map(f = function(left, right) left + right, x, y)
## [[1]]
## [1] 7
## 
## [[2]]
## [1] 9
## 
## [[3]]
## [1] 11
## 
## [[4]]
## [1] 13
## 
## [[5]]
## [1] 15