HW1: Import data from statistical packages and from database management systems. Importing Data from Statistical Packages: we first load the packages below (install them beforehand with install.packages() if needed)
library(haven)
library(readr)
Importing Data from Database Management Systems: we first load the packages below
library(DBI)
library(RMySQL)
Importing a csv file called “winequality-red.csv”
wine_data<- read.csv("winequality-red.csv")
head(wine_data)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality
## 1 5
## 2 5
## 3 5
## 4 6
## 5 5
## 6 5
Importing the hsb2 dataset from statistical-package files in SAS (.sas7bdat) and SPSS (.sav) formats
sas_data <- read_sas("hsb2.sas7bdat")
head(sas_data)
## # A tibble: 6 × 11
## id female race ses schtyp prog read write math science socst
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3 0 1 1 1 2 63 65 48 63 56
## 2 5 0 1 1 1 2 47 40 43 45 31
## 3 16 0 1 1 1 3 47 31 44 36 36
## 4 35 1 1 1 2 1 60 54 50 50 51
## 5 8 1 1 1 1 2 39 44 52 44 48
## 6 19 1 1 1 1 1 28 46 43 44 51
sav_data <- read_sav("hsb2.sav")
head(sav_data)
## # A tibble: 6 × 11
## id female race ses schtyp prog read write math science socst
## <dbl> <dbl+lb> <dbl+l> <dbl+l> <dbl+l> <dbl+l> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 70 0 [male] 4 [whi… 1 [low] 1 [pub… 1 [gen… 57 52 41 47 57
## 2 121 1 [fema… 4 [whi… 2 [mid… 1 [pub… 3 [voc… 68 59 53 63 61
## 3 86 0 [male] 4 [whi… 3 [hig… 1 [pub… 1 [gen… 44 33 54 58 31
## 4 141 0 [male] 4 [whi… 3 [hig… 1 [pub… 3 [voc… 63 44 47 53 56
## 5 172 0 [male] 4 [whi… 2 [mid… 1 [pub… 2 [aca… 47 52 57 53 61
## 6 113 0 [male] 4 [whi… 2 [mid… 1 [pub… 2 [aca… 44 52 51 63 61
HW2: Merging variables from 2 to 3 datasets using $ and %>%
# Before merging the datasets, we first have to load the necessary packages and import the datasets we will use
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read both price files; stringsAsFactors = FALSE keeps text columns as character
data_upl <- read.csv(file = "NIFTY-50/UPL.csv", stringsAsFactors = FALSE)
data_vedl <- read.csv(file = "NIFTY-50/VEDL.csv", stringsAsFactors = FALSE)

# Turn the Date columns into Date objects before using them as the join key
data_upl$Date <- as.Date(data_upl$Date)
data_vedl$Date <- as.Date(data_vedl$Date)

# Keep and rename Close/Open/High from each dataset, then full-join on Date
# so that dates present in only one of the two files are retained
merged_data <- data_upl %>%
  select(Date, UPL_Close = Close, UPL_Open = Open, UPL_High = High) %>%
  full_join(
    data_vedl %>%
      select(Date, VEDL_Close = Close, VEDL_Open = Open, VEDL_High = High),
    by = "Date"
  )
head(merged_data, 10)
## Date UPL_Close UPL_Open UPL_High VEDL_Close VEDL_Open VEDL_High
## 1 2004-01-23 553.80 100.0 560.0 618.25 600.00 624.0
## 2 2004-01-27 540.35 570.0 570.0 623.10 627.80 650.0
## 3 2004-01-28 538.85 530.0 565.0 629.00 622.95 648.8
## 4 2004-01-29 515.10 530.0 543.5 625.95 630.00 640.5
## 5 2004-01-30 482.95 519.0 519.0 604.70 625.00 633.0
## 6 2004-02-03 461.45 472.0 475.0 607.30 605.00 629.0
## 7 2004-02-04 475.15 442.0 479.0 604.90 621.00 621.0
## 8 2004-02-05 480.15 475.0 489.0 600.60 607.70 615.0
## 9 2004-02-06 480.75 461.1 486.0 589.20 606.90 606.9
## 10 2004-02-09 471.95 475.0 483.0 581.10 596.20 597.0
HW3: How do trace() and recover() work?
trace(): temporarily inserts debugging code into an existing function without modifying the original source code.
It allows you to print messages, inspect variables, pause execution, run custom code at specific points inside a function.
Example:
#' Compute the total price of an order including tax.
#'
#' @param price Unit price (numeric; arithmetic is element-wise).
#' @param quantity Number of units (numeric).
#' @param tax_rate Tax rate applied to the pre-tax total, as a fraction.
#'   Defaults to 0.18, the rate that was previously hard-coded.
#' @return The pre-tax total plus tax, i.e. price * quantity * (1 + tax_rate).
calculate_total <- function(price, quantity, tax_rate = 0.18) {
  total <- price * quantity
  tax <- total * tax_rate
  total + tax
}
# Let's add a trace to the function above
# Attach a tracer that prints both arguments every time the function is entered
trace(
  what = "calculate_total",
  tracer = quote({
    cat("Price =", price, "\n")
    cat("Quantity =", quantity, "\n")
  })
)
## [1] "calculate_total"
calculate_total(100, 2)
## Tracing calculate_total(100, 2) on entry
## Price = 100
## Quantity = 2
## [1] 236
recover() is an error-handling debugger. When an error occurs, it lets you enter function environments, inspect variables, and move through the call stack. It is extremely useful for diagnosing complex errors.
Example:
# Return the natural logarithm of the quotient a / b.
divide_numbers <- function(a, b) {
  log(a / b)
}
# Demo driver: pass the fixed inputs 10 and 12 to divide_numbers() and
# return whatever it returns.
process_data <- function() {
  numerator <- 10
  denominator <- 12
  divide_numbers(numerator, denominator)
}
process_data()
## [1] -0.1823216
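# Enabling recovery mode. The option must be set BEFORE the failing code runs;
# from then on, any error in an interactive session opens recover()'s frame
# browser (in non-interactive use it only dumps the frames).
# Note: process_data() above completes without an error, so it does not
# trigger recover(); a call that fails, e.g. divide_numbers("10", 2), would.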
options(error = recover)
How to use group_by
library(dplyr)
data("cars")
head(cars)
## speed dist
## 1 4 2
## 2 4 10
## 3 7 4
## 4 7 22
## 5 8 16
## 6 9 10
# Mean of dist within each speed group
cars %>%
  group_by(speed) %>%
  summarise(avrg_dist = mean(dist))
## # A tibble: 19 × 2
## speed avrg_dist
## <dbl> <dbl>
## 1 4 6
## 2 7 13
## 3 8 16
## 4 9 10
## 5 10 26
## 6 11 22.5
## 7 12 21.5
## 8 13 35
## 9 14 50.5
## 10 15 33.3
## 11 16 36
## 12 17 40.7
## 13 18 64.5
## 14 19 50
## 15 20 50.4
## 16 22 66
## 17 23 54
## 18 24 93.8
## 19 25 85
HW4: Make functions that calculate summary statistics and apply them to a variable to show that they work. Make a function to calculate a two-sample t-test, then apply it to sample data
# we will first create a dataset to use while calculating the summary statistics
scores <- c(78, 85, 90, 88, 76, 95, 89, 84, 91, 87)
# Creating the function containing all of the summary statistics
#' Compute basic summary statistics for a numeric vector.
#'
#' @param x A numeric vector.
#' @param na.rm Should missing values be dropped before computing? Defaults
#'   to FALSE, which matches the previous behaviour (an NA in `x` propagates
#'   to the results).
#' @return A named list with elements Mean, Median, Standard_Deviation,
#'   Minimum, Maximum, Variance and Range (Maximum - Minimum).
summary_statistics <- function(x, na.rm = FALSE) {
  # Compute the extremes once and reuse them for the range.
  lowest <- min(x, na.rm = na.rm)
  highest <- max(x, na.rm = na.rm)

  list(
    Mean = mean(x, na.rm = na.rm),
    Median = median(x, na.rm = na.rm),
    Standard_Deviation = sd(x, na.rm = na.rm),
    Minimum = lowest,
    Maximum = highest,
    Variance = var(x, na.rm = na.rm),
    Range = highest - lowest
  )
}
summary_statistics(scores)
## $Mean
## [1] 86.3
##
## $Median
## [1] 87.5
##
## $Standard_Deviation
## [1] 5.812821
##
## $Minimum
## [1] 76
##
## $Maximum
## [1] 95
##
## $Variance
## [1] 33.78889
##
## $Range
## [1] 19
What is a Two-Sample t-test?
A two-sample t-test compares the means of two groups to determine whether they are significantly different.
# We will first create sample data
group_A <- c(78, 85, 90, 88, 76, 95, 89)
group_B <- c(72, 80, 79, 83, 77, 81, 75)
# Creating the function with two sample t-test
#' Run a two-sample t-test comparing the means of two numeric samples.
#'
#' @param x,y Numeric vectors holding the two samples.
#' @param ... Further arguments forwarded to t.test(), e.g.
#'   `var.equal = TRUE` or `alternative = "greater"`. With none supplied the
#'   call is exactly t.test(x, y), as before.
#' @return The object returned by t.test().
two_sample_ttest <- function(x, y, ...) {
  t.test(x, y, ...)
}
# Applying the function to our sample data
two_sample_ttest(group_A, group_B)
##
## Welch Two Sample t-test
##
## data: x and y
## t = 2.636, df = 9.3824, p-value = 0.02619
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.134899 14.293673
## sample estimates:
## mean of x mean of y
## 85.85714 78.14286
vapply, sapply, and lapply functions with examples
# Sample data to use on the functions
x <- list(a = 1:3, b = 4:6)
df <- data.frame(x = 1:4, y = c(2, 4, 6, 8))
# lapply: always returns a list
res_lapply <- lapply(x, sum)
print(res_lapply)
## $a
## [1] 6
##
## $b
## [1] 15
# sapply: tries to simplify result to vector/matrix
res_sapply <- sapply(x, sum)
print(res_sapply)
## a b
## 6 15
# vapply: safe, type-checked version of sapply
res_vapply <- vapply(x, sum, integer(1))
print(res_vapply)
## a b
## 6 15