Assignments

HW1: Import data from statistical packages and from database management systems. Importing Data from Statistical Packages: we first load the packages below.

library(haven)
library(readr)

Importing Data from Database Management Systems: we first load the packages below.

library(DBI)
library(RMySQL)

Importing a csv file called “winequality-red.csv”

# Read the red-wine quality CSV (looked up relative to the working directory)
# into a data frame, then preview its first six rows.
wine_data <- utils::read.csv(file = "winequality-red.csv")
utils::head(wine_data, n = 6L)

##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.4             0.70        0.00            1.9     0.076
## 2           7.8             0.88        0.00            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4             0.70        0.00            1.9     0.076
## 6           7.4             0.66        0.00            1.8     0.075
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  11                   34  0.9978 3.51      0.56     9.4
## 2                  25                   67  0.9968 3.20      0.68     9.8
## 3                  15                   54  0.9970 3.26      0.65     9.8
## 4                  17                   60  0.9980 3.16      0.58     9.8
## 5                  11                   34  0.9978 3.51      0.56     9.4
## 6                  13                   40  0.9978 3.51      0.56     9.4
##   quality
## 1       5
## 2       5
## 3       5
## 4       6
## 5       5
## 6       5

Importing the hsb2 dataset from statistical-package files in SAS (.sas7bdat) and SPSS (.sav) formats

# Read the SAS copy of the hsb2 data with haven, then preview the first rows.
sas_data <- haven::read_sas("hsb2.sas7bdat")
utils::head(sas_data, n = 6L)

## # A tibble: 6 × 11
##      id female  race   ses schtyp  prog  read write  math science socst
##   <dbl>  <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl>
## 1     3      0     1     1      1     2    63    65    48      63    56
## 2     5      0     1     1      1     2    47    40    43      45    31
## 3    16      0     1     1      1     3    47    31    44      36    36
## 4    35      1     1     1      2     1    60    54    50      50    51
## 5     8      1     1     1      1     2    39    44    52      44    48
## 6    19      1     1     1      1     1    28    46    43      44    51

# Read the SPSS copy of the hsb2 data with haven, then preview the first rows.
sav_data <- haven::read_sav("hsb2.sav")
utils::head(sav_data, n = 6L)

## # A tibble: 6 × 11
##      id female   race    ses     schtyp  prog     read write  math science socst
##   <dbl> <dbl+lb> <dbl+l> <dbl+l> <dbl+l> <dbl+l> <dbl> <dbl> <dbl>   <dbl> <dbl>
## 1    70 0 [male] 4 [whi… 1 [low] 1 [pub… 1 [gen…    57    52    41      47    57
## 2   121 1 [fema… 4 [whi… 2 [mid… 1 [pub… 3 [voc…    68    59    53      63    61
## 3    86 0 [male] 4 [whi… 3 [hig… 1 [pub… 1 [gen…    44    33    54      58    31
## 4   141 0 [male] 4 [whi… 3 [hig… 1 [pub… 3 [voc…    63    44    47      53    56
## 5   172 0 [male] 4 [whi… 2 [mid… 1 [pub… 2 [aca…    47    52    57      53    61
## 6   113 0 [male] 4 [whi… 2 [mid… 1 [pub… 2 [aca…    44    52    51      63    61

HW2: Merging variables from 2 to 3 datasets using $ and %>%

# Before merging datasets, we first have to import them and load the necessary packages
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the two price-history files from the NIFTY-50 folder.
data_upl <- read.csv(file = "NIFTY-50/UPL.csv", stringsAsFactors = FALSE)
data_vedl <- read.csv(file = "NIFTY-50/VEDL.csv", stringsAsFactors = FALSE)

# Convert the Date columns (accessed with `$`) to Date objects before they
# are used as the join key.
data_upl$Date <- as.Date(data_upl$Date)
data_vedl$Date <- as.Date(data_vedl$Date)

# Keep Date plus the Close/Open/High columns of each table, prefixed with the
# ticker, and combine them with a full join on Date (using `%>%`).
merged_data <- data_upl %>%
  select(Date, UPL_Close = Close, UPL_Open = Open, UPL_High = High) %>%
  full_join(
    data_vedl %>%
      select(Date, VEDL_Close = Close, VEDL_Open = Open, VEDL_High = High),
    by = "Date"
  )
head(merged_data, n = 10)

##          Date UPL_Close UPL_Open UPL_High VEDL_Close VEDL_Open VEDL_High
## 1  2004-01-23    553.80    100.0    560.0     618.25    600.00     624.0
## 2  2004-01-27    540.35    570.0    570.0     623.10    627.80     650.0
## 3  2004-01-28    538.85    530.0    565.0     629.00    622.95     648.8
## 4  2004-01-29    515.10    530.0    543.5     625.95    630.00     640.5
## 5  2004-01-30    482.95    519.0    519.0     604.70    625.00     633.0
## 6  2004-02-03    461.45    472.0    475.0     607.30    605.00     629.0
## 7  2004-02-04    475.15    442.0    479.0     604.90    621.00     621.0
## 8  2004-02-05    480.15    475.0    489.0     600.60    607.70     615.0
## 9  2004-02-06    480.75    461.1    486.0     589.20    606.90     606.9
## 10 2004-02-09    471.95    475.0    483.0     581.10    596.20     597.0

HW3: How do trace() and recover() work?

trace(): temporarily inserts debugging code into an existing function without modifying the original source code.

It allows you to print messages, inspect variables, pause execution, and run custom code at specific points inside a function.

Example:

#' Compute the tax-inclusive total of a purchase
#'
#' @param price Unit price.
#' @param quantity Number of units.
#' @param tax_rate Tax rate applied to the pre-tax total, as a fraction.
#'   Defaults to 0.18, the rate that used to be hard-coded in the body.
#' @return The pre-tax total (`price * quantity`) plus the tax on it.
calculate_total <- function(price, quantity, tax_rate = 0.18) {
  total <- price * quantity
  tax <- total * tax_rate
  total + tax
}

# Now let's attach a tracer to the function defined above

# Attach a tracer that runs on entry to calculate_total() and prints the
# arguments it was called with. The function's own source is not edited.
trace(
  "calculate_total",
  tracer = quote({
    cat("Price =", price, "\n")
    cat("Quantity =", quantity, "\n")
  })
)

## [1] "calculate_total"

calculate_total(100, 2)

## Tracing calculate_total(100, 2) on entry 
## Price = 100 
## Quantity = 2

## [1] 236

# Remove the tracer again; otherwise every later call to calculate_total()
# in this session would keep printing the trace messages.
untrace("calculate_total")

recover() is an error-handling debugger. Once it is installed as the error handler with options(error = recover), an error lets you enter the active function environments, inspect variables, and move through the call stack. It is extremely useful for diagnosing complex errors.

Example:

# Return the natural logarithm of the quotient a / b.
divide_numbers <- function(a, b) {
  log(a / b)
}

# Call divide_numbers() with two fixed values (10 and 12) and return its
# result, giving the example a second frame on the call stack.
process_data <- function() {
  numerator <- 10
  denominator <- 12
  divide_numbers(numerator, denominator)
}

# Run the example once.
process_data()

## [1] -0.1823216

# Enabling recovery mode. The option has to be set BEFORE the failing call:
# recover() only runs when an error is raised while it is the error handler.
# The previous setting is kept so it can be put back afterwards.
old_error_opt <- options(error = recover)

# To see recover() in action, run a failing call interactively, e.g.
#   divide_numbers("10", 12)
# `a / b` fails on the character argument, and recover() then lists the
# active frames so one can be selected and inspected.

# Restore the previous error handler so later errors in this session do not
# drop into the debugger.
options(old_error_opt)

How to use group_by

library(dplyr)
# Load the cars data set shipped with R (columns speed and dist) and preview
# its first six rows.
data("cars", package = "datasets")
head(cars, n = 6L)

##   speed dist
## 1     4    2
## 2     4   10
## 3     7    4
## 4     7   22
## 5     8   16
## 6     9   10

# Group the rows by speed and compute the mean of dist within each group.
cars %>%
  group_by(speed) %>%
  summarise(avrg_dist = mean(dist))

## # A tibble: 19 × 2
##    speed avrg_dist
##    <dbl>     <dbl>
##  1     4       6  
##  2     7      13  
##  3     8      16  
##  4     9      10  
##  5    10      26  
##  6    11      22.5
##  7    12      21.5
##  8    13      35  
##  9    14      50.5
## 10    15      33.3
## 11    16      36  
## 12    17      40.7
## 13    18      64.5
## 14    19      50  
## 15    20      50.4
## 16    22      66  
## 17    23      54  
## 18    24      93.8
## 19    25      85

HW4: Make a function that calculates summary statistics and apply it to a variable to show that it works. Make a function to calculate a two-sample t-test, then apply it to sample data.

# Example data: ten scores used to demonstrate the summary function.
scores <- c(78, 85, 90, 88, 76, 95, 89, 84, 91, 87)

#' Compute a set of summary statistics for a vector
#'
#' @param x Vector of values to summarise.
#' @param na.rm Should missing values be dropped before computing each
#'   statistic? Defaults to `FALSE`, which matches the behaviour of the
#'   underlying functions: any `NA` in `x` makes the statistics `NA`.
#' @return A named list with elements `Mean`, `Median`,
#'   `Standard_Deviation`, `Minimum`, `Maximum`, `Variance` and `Range`
#'   (maximum minus minimum).
summary_statistics <- function(x, na.rm = FALSE) {
  # Minimum and maximum are computed once and reused for the range.
  bounds <- range(x, na.rm = na.rm)

  list(
    Mean = mean(x, na.rm = na.rm),
    Median = median(x, na.rm = na.rm),
    Standard_Deviation = sd(x, na.rm = na.rm),
    Minimum = bounds[1],
    Maximum = bounds[2],
    Variance = var(x, na.rm = na.rm),
    Range = bounds[2] - bounds[1]
  )
}

# Apply the function to the example data.
summary_statistics(scores)

## $Mean
## [1] 86.3
## 
## $Median
## [1] 87.5
## 
## $Standard_Deviation
## [1] 5.812821
## 
## $Minimum
## [1] 76
## 
## $Maximum
## [1] 95
## 
## $Variance
## [1] 33.78889
## 
## $Range
## [1] 19

What is a Two-Sample t-test?

A two-sample t-test compares the means of two groups to determine whether they are significantly different.

# Sample data: two groups of seven scores each.
group_A <- c(78, 85, 90, 88, 76, 95, 89)
group_B <- c(72, 80, 79, 83, 77, 81, 75)

#' Run a two-sample t-test
#'
#' @param x,y Numeric vectors holding the observations of the two groups.
#' @param ... Further arguments forwarded unchanged to `t.test()`, for
#'   example `var.equal = TRUE`, `alternative = "greater"` or
#'   `conf.level = 0.99`. With none given, `t.test()`'s defaults apply.
#' @return The object returned by `t.test()` (class `"htest"`).
two_sample_ttest <- function(x, y, ...) {
  t.test(x, y, ...)
}

# Applying the function to our sample data
two_sample_ttest(group_A, group_B)

## 
##  Welch Two Sample t-test
## 
## data:  x and y
## t = 2.636, df = 9.3824, p-value = 0.02619
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   1.134899 14.293673
## sample estimates:
## mean of x mean of y 
##  85.85714  78.14286

vapply, sapply, and lapply functions with examples

# Inputs for the demonstrations: `x` is a named list of two integer vectors.
# `df` is created here as well, although the calls below only use `x`.
x <- list(a = 1:3, b = 4:6)
df <- data.frame(x = 1:4, y = c(2, 4, 6, 8))

# lapply(): applies sum() to each element and always hands back a list
res_lapply <- lapply(X = x, FUN = sum)
print(res_lapply)

## $a
## [1] 6
## 
## $b
## [1] 15

# sapply(): same computation, but the result is simplified to a named vector
res_sapply <- sapply(X = x, FUN = sum)
print(res_sapply)

##  a  b 
##  6 15

# vapply(): like sapply(), but each result must match the declared template
# (here a single integer), otherwise an error is raised
res_vapply <- vapply(X = x, FUN = sum, FUN.VALUE = integer(1))
print(res_vapply)

##  a  b 
##  6 15

Assignments

Olga Isengwe

2026-05-24