options(repos = c(CRAN = “https://cloud.r-project.org”)) install.packages(“readxl”)
library(readxl)
student_data <- read.csv("C:\\Moses\\StudentsPerformance.csv")
str(student_data)
## 'data.frame': 1000 obs. of 8 variables:
## $ gender : chr "female" "female" "female" "male" ...
## $ ethnicity : chr "group B" "group C" "group B" "group A" ...
## $ parental.level.of.education: chr "bachelor's degree" "some college" "master's degree" "associate's degree" ...
## $ lunch : chr "standard" "standard" "standard" "free/reduced" ...
## $ test.preparation.course : chr "none" "completed" "none" "none" ...
## $ math.score : int 72 69 90 47 76 71 88 40 64 38 ...
## $ reading.score : int 72 90 95 57 78 83 95 43 64 60 ...
## $ writing.score : int 74 88 93 44 75 78 92 39 67 50 ...
head(student_data)
## gender ethnicity parental.level.of.education lunch
## 1 female group B bachelor's degree standard
## 2 female group C some college standard
## 3 female group B master's degree standard
## 4 male group A associate's degree free/reduced
## 5 male group C some college standard
## 6 female group B associate's degree standard
## test.preparation.course math.score reading.score writing.score
## 1 none 72 72 74
## 2 completed 69 90 88
## 3 none 90 95 93
## 4 none 47 57 44
## 5 none 76 78 75
## 6 none 71 83 78
tail(student_data)
## gender ethnicity parental.level.of.education lunch
## 995 male group A high school standard
## 996 female group E master's degree standard
## 997 male group C high school free/reduced
## 998 female group C high school free/reduced
## 999 female group D some college standard
## 1000 female group D some college free/reduced
## test.preparation.course math.score reading.score writing.score
## 995 none 63 63 62
## 996 completed 88 99 95
## 997 none 62 55 55
## 998 completed 59 71 65
## 999 completed 68 78 77
## 1000 none 77 86 86
View(student_data)
data <- student_data[2:3,]
print(data)
## gender ethnicity parental.level.of.education lunch test.preparation.course
## 2 female group C some college standard completed
## 3 female group B master's degree standard none
## math.score reading.score writing.score
## 2 69 90 88
## 3 90 95 93
summary(student_data$writing.score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.00 57.75 69.00 68.05 79.00 100.00
hist(student_data$reading.score, col = "blue")
library(RMySQL)
## Loading required package: DBI
con <- dbConnect(
MySQL(),
user = "root",
password = "Motta@123",
host = "localhost",
port = 3306,
dbname = "sakila"
)
tables <- dbListTables(con)
print(tables)
## [1] "actor" "actor_info"
## [3] "address" "category"
## [5] "city" "country"
## [7] "customer" "customer_list"
## [9] "film" "film_actor"
## [11] "film_category" "film_list"
## [13] "film_text" "inventory"
## [15] "language" "nicer_but_slower_film_list"
## [17] "payment" "rental"
## [19] "sales_by_film_category" "sales_by_store"
## [21] "staff" "staff_list"
## [23] "store"
###Example 1: Merge Using Two Variables
df1 <- data.frame(
id = c(1, 2, 3, 4),
subject = c("Python", "C programming", "Python", "R programming"),
name = c("Moses", "Kevine", "Paul", "Jackson")
)
df2 <- data.frame(
id = c(1, 2, 3, 4),
subject = c("Python", "C programming", "Python", "R programming"),
marks = c(80, 90, 75, 85)
)
merged_data <- merge(df1, df2, by = c("id", "subject"))
merged_data
## id subject name marks
## 1 1 Python Moses 80
## 2 2 C programming Kevine 90
## 3 3 Python Paul 75
## 4 4 R programming Jackson 85
###Example 2: Merge with Different Column Names (Left Join Style)
df1 <- data.frame(
student_id = c(1, 2, 3, 4),
subject = c("Python", "C programming", "Python", "R programming"),
name = c("Moses", "Kevine", "Paul", "Jackson")
)
df2 <- data.frame(
id = c(1, 2, 3, 4),
subject = c("Python", "C programming", "Python", "R programming"),
marks = c(80, 90, 75, 85)
)
merged_data <- merge(df1, df2,
by.x = "student_id",
by.y = "id",
all.x = TRUE)
merged_data
## student_id subject.x name subject.y marks
## 1 1 Python Moses Python 80
## 2 2 C programming Kevine C programming 90
## 3 3 Python Paul Python 75
## 4 4 R programming Jackson R programming 85
###select()
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
student <- student_data %>% select (gender, ethnicity, lunch, math.score)
head(student)
## gender ethnicity lunch math.score
## 1 female group B standard 72
## 2 female group C standard 69
## 3 female group B standard 90
## 4 male group A free/reduced 47
## 5 male group C standard 76
## 6 female group B standard 71
###filter()
library(dplyr)
student<- student_data %>% filter(gender == "female" & math.score > 95)
head(student)
## gender ethnicity parental.level.of.education lunch test.preparation.course
## 1 female group E bachelor's degree standard completed
## 2 female group C bachelor's degree standard completed
## 3 female group D some high school standard completed
## 4 female group E high school standard none
## 5 female group E some college standard none
## 6 female group E bachelor's degree standard none
## math.score reading.score writing.score
## 1 99 100 100
## 2 96 100 100
## 3 97 100 100
## 4 99 93 90
## 5 100 92 97
## 6 100 100 100
###arrange()
library(dplyr)
student <- student_data %>% arrange(gender, desc(math.score))
head(student)
## gender ethnicity parental.level.of.education lunch test.preparation.course
## 1 female group E some college standard none
## 2 female group E bachelor's degree standard none
## 3 female group E associate's degree standard none
## 4 female group E bachelor's degree standard completed
## 5 female group E high school standard none
## 6 female group D some college standard none
## math.score reading.score writing.score
## 1 100 92 97
## 2 100 100 100
## 3 100 100 100
## 4 99 100 100
## 5 99 93 90
## 6 98 100 99
###rename()
student <- student_data %>% rename(student_gender = gender, math_marks = math.score)
head(student)
## student_gender ethnicity parental.level.of.education lunch
## 1 female group B bachelor's degree standard
## 2 female group C some college standard
## 3 female group B master's degree standard
## 4 male group A associate's degree free/reduced
## 5 male group C some college standard
## 6 female group B associate's degree standard
## test.preparation.course math_marks reading.score writing.score
## 1 none 72 72 74
## 2 completed 69 90 88
## 3 none 90 95 93
## 4 none 47 57 44
## 5 none 76 78 75
## 6 none 71 83 78
###mutate()
student <- student_data %>% mutate(total_marks = math.score + reading.score + writing.score) ## adding new column of total score
head(student)
## gender ethnicity parental.level.of.education lunch
## 1 female group B bachelor's degree standard
## 2 female group C some college standard
## 3 female group B master's degree standard
## 4 male group A associate's degree free/reduced
## 5 male group C some college standard
## 6 female group B associate's degree standard
## test.preparation.course math.score reading.score writing.score total_marks
## 1 none 72 72 74 218
## 2 completed 69 90 88 247
## 3 none 90 95 93 278
## 4 none 47 57 44 148
## 5 none 76 78 75 229
## 6 none 71 83 78 232
###mutate()
student <- student_data %>% mutate(total_marks = math.score + reading.score + writing.score, avg = (math.score + reading.score + writing.score)/3)
head(student)
## gender ethnicity parental.level.of.education lunch
## 1 female group B bachelor's degree standard
## 2 female group C some college standard
## 3 female group B master's degree standard
## 4 male group A associate's degree free/reduced
## 5 male group C some college standard
## 6 female group B associate's degree standard
## test.preparation.course math.score reading.score writing.score total_marks
## 1 none 72 72 74 218
## 2 completed 69 90 88 247
## 3 none 90 95 93 278
## 4 none 47 57 44 148
## 5 none 76 78 75 229
## 6 none 71 83 78 232
## avg
## 1 72.66667
## 2 82.33333
## 3 92.66667
## 4 49.33333
## 5 76.33333
## 6 77.33333
###mutate()
student <- student_data %>% mutate(total_marks = math.score + reading.score + writing.score, avg = (math.score + reading.score + writing.score)/3, result = ifelse(((math.score + reading.score + writing.score)/3) >= 50, "Pass", "Fail"))
head(student)
## gender ethnicity parental.level.of.education lunch
## 1 female group B bachelor's degree standard
## 2 female group C some college standard
## 3 female group B master's degree standard
## 4 male group A associate's degree free/reduced
## 5 male group C some college standard
## 6 female group B associate's degree standard
## test.preparation.course math.score reading.score writing.score total_marks
## 1 none 72 72 74 218
## 2 completed 69 90 88 247
## 3 none 90 95 93 278
## 4 none 47 57 44 148
## 5 none 76 78 75 229
## 6 none 71 83 78 232
## avg result
## 1 72.66667 Pass
## 2 82.33333 Pass
## 3 92.66667 Pass
## 4 49.33333 Fail
## 5 76.33333 Pass
## 6 77.33333 Pass
student <- student_data %>% group_by(gender) %>% summarise(avg = mean(math.score))
head(student)
## # A tibble: 2 × 2
## gender avg
## <chr> <dbl>
## 1 female 63.6
## 2 male 68.7
math_marks <- student_data$math.score
myfun <- function(math_marks) {
avg <- mean(math_marks)
print(avg)
}
trace(myfun, quote(cat("Function is running\n")), print = FALSE)
## [1] "myfun"
myfun(math_marks)
## Function is running
## [1] 66.089
enable recover function
options(error = recover)
###Example 1: Division by 0
f <- function(x) {
y <- x / 0
y
}
f(10)
## [1] Inf
marks <- student_data$math.score
my_summary <- function(math_marks) {
# Sort the vector
x <- sort(marks)
n <- length(marks)
# Minimum
minimum <- marks[1]
# Maximum
maximum <- marks[n]
# Mean
total <- 0
for(i in marks) {
total <- total + i
}
mean_value <- total / n
# Median
if(n %% 2 == 1) {
median_value <- marks[(n + 1) / 2]
} else {
median_value <- (x[n/2] + x[(n/2) + 1]) / 2
}
# First Quartile (Q1)
q1_position <- (n + 1) / 4
q1 <- x[round(q1_position)]
# Third Quartile (Q3)
q3_position <- 3 * (n + 1) / 4
q3 <- x[round(q3_position)]
# Display results
cat("Minimum :", minimum, "\n")
cat("1st Quartile :", q1, "\n")
cat("Median :", median_value, "\n")
cat("Mean :", mean_value, "\n")
cat("3rd Quartile :", q3, "\n")
cat("Maximum :", maximum, "\n")
}
my_summary(marks)
## Minimum : 72
## 1st Quartile : 57
## Median : 66
## Mean : 66.089
## 3rd Quartile : 77
## Maximum : 77
ttest_function <- function(x, y) {
# Find lengths manually
n1 <- 0
for(i in x) {
n1 <- n1 + 1
}
n2 <- 0
for(i in y) {
n2 <- n2 + 1
}
# Compute means manually
sum1 <- 0
for(i in x) {
sum1 <- sum1 + i
}
mean1 <- sum1 / n1
sum2 <- 0
for(i in y) {
sum2 <- sum2 + i
}
mean2 <- sum2 / n2
# Compute variances manually
ss1 <- 0
for(i in x) {
ss1 <- ss1 + (i - mean1)^2
}
var1 <- ss1 / (n1 - 1)
ss2 <- 0
for(i in y) {
ss2 <- ss2 + (i - mean2)^2
}
var2 <- ss2 / (n2 - 1)
# Pooled variance
pooled_var <- (
((n1 - 1) * var1) +
((n2 - 1) * var2)
) / (n1 + n2 - 2)
# Standard error
se <- sqrt(
pooled_var * ((1 / n1) + (1 / n2))
)
# t statistic
t_value <- (mean1 - mean2) / se
# Print results
cat("Sample 1 Mean :", mean1, "\n")
cat("Sample 2 Mean :", mean2, "\n")
cat("Variance 1 :", var1, "\n")
cat("Variance 2 :", var2, "\n")
cat("t statistic :", t_value, "\n")
}
x <- student_data$math.score
y <- student_data$reading.score
ttest_function(x,y)
## Sample 1 Mean : 66.089
## Sample 2 Mean : 69.169
## Variance 1 : 229.919
## Variance 2 : 213.1656
## t statistic : -4.627084
###lapply(): Applies a function to each element and keeps output in a list
math <- student_data$math.score
reading <- student_data$reading.score
numbers <- list(math,reading)
lapply(numbers, sum)
## [[1]]
## [1] 66089
##
## [[2]]
## [1] 69169
###sapply() : Like lapply(), but tries to simplify output.
math <- student_data$math.score
reading <- student_data$reading.score
numbers <- list(math,reading)
sapply(numbers, sum)
## [1] 66089 69169
###vapply(): You MUST specify expected output type
math <- student_data$math.score
reading <- student_data$reading.score
numbers <- list(math,reading)
vapply(numbers, sum, numeric(1))
## [1] 66089 69169
###mapply(): Applies function across multiple vectors at the same time.
math <- student_data$math.score
reading <- student_data$reading.score
mapply(sum, math)
## [1] 72 69 90 47 76 71 88 40 64 38 58 40 65 78 50 69 88 18
## [19] 46 54 66 65 44 69 74 73 69 67 70 62 69 63 56 40 97 81
## [37] 74 50 75 57 55 58 53 59 50 65 55 66 57 82 53 77 53 88
## [55] 71 33 82 52 58 0 79 39 62 69 59 67 45 60 61 39 58 63
## [73] 41 61 49 44 30 80 61 62 47 49 50 72 42 73 76 71 58 73
## [91] 65 27 71 43 79 78 65 63 58 65 79 68 85 60 98 58 87 66
## [109] 52 70 77 62 54 51 99 84 75 78 51 55 79 91 88 63 83 87
## [127] 72 65 82 51 89 53 87 75 74 58 51 70 59 71 76 59 42 57
## [145] 88 22 88 73 68 100 62 77 59 54 62 70 66 60 61 66 82 75
## [163] 49 52 81 96 53 58 68 67 72 94 79 63 43 81 46 71 52 97
## [181] 62 46 50 65 45 65 80 62 48 77 66 76 62 77 69 61 59 55
## [199] 45 78 67 65 69 57 59 74 82 81 74 58 80 35 42 60 87 84
## [217] 83 34 66 61 56 87 55 86 52 45 72 57 68 88 76 46 67 92
## [235] 83 80 63 64 54 84 73 80 56 59 75 85 89 58 65 68 47 71
## [253] 60 80 54 62 64 78 70 65 64 79 44 99 76 59 63 69 88 71
## [271] 69 58 47 65 88 83 85 59 65 73 53 45 73 70 37 81 97 67
## [289] 88 77 76 86 63 65 78 67 46 71 40 90 81 56 67 80 74 69
## [307] 99 51 53 49 73 66 67 68 59 71 77 83 63 56 67 75 71 43
## [325] 41 82 61 28 82 41 71 47 62 90 83 61 76 49 24 35 58 61
## [343] 69 67 79 72 62 77 75 87 52 66 63 46 59 61 63 42 59 80
## [361] 58 85 52 27 59 49 69 61 44 73 84 45 74 82 59 46 80 85
## [379] 71 66 80 87 79 38 38 67 64 57 62 73 73 77 76 57 65 48
## [397] 50 85 74 60 59 53 49 88 54 63 65 82 52 87 70 84 71 63
## [415] 51 84 71 74 68 57 82 57 47 59 41 62 86 69 65 68 64 61
## [433] 61 47 73 50 75 75 70 89 67 78 59 73 79 67 69 86 47 81
## [451] 64 100 65 65 53 37 79 53 100 72 53 54 71 77 75 84 26 72
## [469] 77 91 83 63 68 59 90 71 76 80 55 76 73 52 68 59 49 70
## [487] 61 60 64 79 65 64 83 81 54 68 54 59 66 76 74 94 63 95
## [505] 40 82 68 55 79 86 76 64 62 54 77 76 74 66 66 67 71 91
## [523] 69 54 53 68 56 36 29 62 68 47 62 79 73 66 51 51 85 97
## [541] 75 79 81 82 64 78 92 72 62 79 79 87 40 77 53 32 55 61
## [559] 53 73 74 63 96 63 48 48 92 61 63 68 71 91 53 50 74 40
## [577] 61 81 48 53 81 77 63 73 69 65 55 44 54 48 58 71 68 74
## [595] 92 56 30 53 69 65 54 29 76 60 84 75 85 40 61 58 69 58
## [613] 94 65 82 60 37 88 95 65 35 62 58 100 61 100 69 61 49 44
## [631] 67 79 66 75 84 71 67 80 86 76 41 74 72 74 70 65 59 64
## [649] 50 69 51 68 85 65 73 62 77 69 43 90 74 73 55 65 80 50
## [667] 63 77 73 81 66 52 69 65 69 50 73 70 81 63 67 60 62 29
## [685] 62 94 85 77 53 93 49 73 66 77 49 79 75 59 57 66 79 57
## [703] 87 63 59 62 46 66 89 42 93 80 98 81 60 76 73 96 76 91
## [721] 62 55 74 50 47 81 65 68 73 53 68 55 87 55 53 67 92 53
## [739] 81 61 80 37 81 59 55 72 69 69 50 87 71 68 79 77 58 84
## [757] 55 70 52 69 53 48 78 62 60 74 58 76 68 58 52 75 52 62
## [775] 66 49 66 35 72 94 46 77 76 52 91 32 72 19 68 52 48 60
## [793] 66 89 42 57 70 70 69 52 67 76 87 82 73 75 64 41 90 59
## [811] 51 45 54 87 72 94 45 61 60 77 85 78 49 71 48 62 56 65
## [829] 69 68 61 74 64 77 58 60 73 75 58 66 39 64 23 74 40 90
## [847] 91 64 59 80 71 61 87 82 62 97 75 65 52 87 53 81 39 71
## [865] 97 82 59 61 78 49 59 70 82 90 43 80 81 57 59 64 63 71
## [883] 64 55 51 62 93 54 69 44 86 85 50 88 59 32 36 63 67 65
## [901] 85 73 34 93 67 88 57 79 67 70 50 69 52 47 46 68 100 44
## [919] 57 91 69 35 72 54 74 74 64 65 46 48 67 62 61 70 98 70
## [937] 67 57 85 77 72 78 81 61 58 54 82 49 49 57 94 75 74 58
## [955] 62 72 84 92 45 75 56 48 100 65 72 62 66 63 68 75 89 78
## [973] 53 49 54 64 60 62 55 91 8 81 79 78 74 57 40 81 44 67
## [991] 86 65 55 62 63 88 62 59 68 77
###map():
library(purrr)
math <- student_data$math.score
reading <- student_data$reading.score
numbers <- list(math, reading)
map(numbers, sum)
## [[1]]
## [1] 66089
##
## [[2]]
## [1] 69169
map_dbl(numbers, mean)
## [1] 66.089 69.169