Assignment 1: Importing data from different sources

Import data from csv file and do some functions on it.

options(repos = c(CRAN = “https://cloud.r-project.org”)) install.packages(“readxl”)

library(readxl)

student_data <- read.csv("C:\\Moses\\StudentsPerformance.csv")

str(student_data)
## 'data.frame':    1000 obs. of  8 variables:
##  $ gender                     : chr  "female" "female" "female" "male" ...
##  $ ethnicity                  : chr  "group B" "group C" "group B" "group A" ...
##  $ parental.level.of.education: chr  "bachelor's degree" "some college" "master's degree" "associate's degree" ...
##  $ lunch                      : chr  "standard" "standard" "standard" "free/reduced" ...
##  $ test.preparation.course    : chr  "none" "completed" "none" "none" ...
##  $ math.score                 : int  72 69 90 47 76 71 88 40 64 38 ...
##  $ reading.score              : int  72 90 95 57 78 83 95 43 64 60 ...
##  $ writing.score              : int  74 88 93 44 75 78 92 39 67 50 ...
head(student_data)
##   gender ethnicity parental.level.of.education        lunch
## 1 female   group B           bachelor's degree     standard
## 2 female   group C                some college     standard
## 3 female   group B             master's degree     standard
## 4   male   group A          associate's degree free/reduced
## 5   male   group C                some college     standard
## 6 female   group B          associate's degree     standard
##   test.preparation.course math.score reading.score writing.score
## 1                    none         72            72            74
## 2               completed         69            90            88
## 3                    none         90            95            93
## 4                    none         47            57            44
## 5                    none         76            78            75
## 6                    none         71            83            78
tail(student_data)
##      gender ethnicity parental.level.of.education        lunch
## 995    male   group A                 high school     standard
## 996  female   group E             master's degree     standard
## 997    male   group C                 high school free/reduced
## 998  female   group C                 high school free/reduced
## 999  female   group D                some college     standard
## 1000 female   group D                some college free/reduced
##      test.preparation.course math.score reading.score writing.score
## 995                     none         63            63            62
## 996                completed         88            99            95
## 997                     none         62            55            55
## 998                completed         59            71            65
## 999                completed         68            78            77
## 1000                    none         77            86            86
View(student_data)

data <- student_data[2:3,]
print(data)
##   gender ethnicity parental.level.of.education    lunch test.preparation.course
## 2 female   group C                some college standard               completed
## 3 female   group B             master's degree standard                    none
##   math.score reading.score writing.score
## 2         69            90            88
## 3         90            95            93
summary(student_data$writing.score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.00   57.75   69.00   68.05   79.00  100.00
hist(student_data$reading.score, col = "blue")

Importing data from MySql server

LOAD LIBRARY

library(RMySQL)
## Loading required package: DBI

CONNECT TO MYSQL DATABASE

con <- dbConnect(
  MySQL(),
  user = "root",
  password = "Motta@123",
  host = "localhost",
  port = 3306,
  dbname = "sakila"
)

SHOW AVAILABLE TABLES

tables <- dbListTables(con)
print(tables)
##  [1] "actor"                      "actor_info"                
##  [3] "address"                    "category"                  
##  [5] "city"                       "country"                   
##  [7] "customer"                   "customer_list"             
##  [9] "film"                       "film_actor"                
## [11] "film_category"              "film_list"                 
## [13] "film_text"                  "inventory"                 
## [15] "language"                   "nicer_but_slower_film_list"
## [17] "payment"                    "rental"                    
## [19] "sales_by_film_category"     "sales_by_store"            
## [21] "staff"                      "staff_list"                
## [23] "store"

Assignment 2: Merging datasets with 2 to 3 variables

###Example 1: Merge Using Two Variables

df1 <- data.frame(
  id = c(1, 2, 3, 4),
  subject = c("Python", "C programming", "Python", "R programming"),
  name = c("Moses", "Kevine", "Paul", "Jackson")
)

df2 <- data.frame(
  id = c(1, 2, 3, 4),
  subject = c("Python", "C programming", "Python", "R programming"),
  marks = c(80, 90, 75, 85)
)

merged_data <- merge(df1, df2, by = c("id", "subject"))

merged_data
##   id       subject    name marks
## 1  1        Python   Moses    80
## 2  2 C programming  Kevine    90
## 3  3        Python    Paul    75
## 4  4 R programming Jackson    85

###Example 2: Merge with Different Column Names (Left Join Style)

df1 <- data.frame(
  student_id = c(1, 2, 3, 4),
  subject = c("Python", "C programming", "Python", "R programming"),
  name = c("Moses", "Kevine", "Paul", "Jackson")
)

df2 <- data.frame(
  id = c(1, 2, 3, 4),
  subject = c("Python", "C programming", "Python", "R programming"),
  marks = c(80, 90, 75, 85)
)

merged_data <- merge(df1, df2,
                     by.x = "student_id",
                     by.y = "id",
                     all.x = TRUE)

merged_data
##   student_id     subject.x    name     subject.y marks
## 1          1        Python   Moses        Python    80
## 2          2 C programming  Kevine C programming    90
## 3          3        Python    Paul        Python    75
## 4          4 R programming Jackson R programming    85

Assignment 3: How to use these functions: select(),filter(), arrange(), rename(), mutate(), group_by, %>%

###select()

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
student <- student_data %>% select (gender, ethnicity, lunch, math.score)
head(student)
##   gender ethnicity        lunch math.score
## 1 female   group B     standard         72
## 2 female   group C     standard         69
## 3 female   group B     standard         90
## 4   male   group A free/reduced         47
## 5   male   group C     standard         76
## 6 female   group B     standard         71

###filter()

library(dplyr)
student<- student_data %>% filter(gender == "female" & math.score > 95)
head(student)
##   gender ethnicity parental.level.of.education    lunch test.preparation.course
## 1 female   group E           bachelor's degree standard               completed
## 2 female   group C           bachelor's degree standard               completed
## 3 female   group D            some high school standard               completed
## 4 female   group E                 high school standard                    none
## 5 female   group E                some college standard                    none
## 6 female   group E           bachelor's degree standard                    none
##   math.score reading.score writing.score
## 1         99           100           100
## 2         96           100           100
## 3         97           100           100
## 4         99            93            90
## 5        100            92            97
## 6        100           100           100

###arrange()

library(dplyr)
student <- student_data %>% arrange(gender, desc(math.score))
head(student)
##   gender ethnicity parental.level.of.education    lunch test.preparation.course
## 1 female   group E                some college standard                    none
## 2 female   group E           bachelor's degree standard                    none
## 3 female   group E          associate's degree standard                    none
## 4 female   group E           bachelor's degree standard               completed
## 5 female   group E                 high school standard                    none
## 6 female   group D                some college standard                    none
##   math.score reading.score writing.score
## 1        100            92            97
## 2        100           100           100
## 3        100           100           100
## 4         99           100           100
## 5         99            93            90
## 6         98           100            99

###rename()

student <- student_data %>% rename(student_gender = gender, math_marks = math.score)
head(student)
##   student_gender ethnicity parental.level.of.education        lunch
## 1         female   group B           bachelor's degree     standard
## 2         female   group C                some college     standard
## 3         female   group B             master's degree     standard
## 4           male   group A          associate's degree free/reduced
## 5           male   group C                some college     standard
## 6         female   group B          associate's degree     standard
##   test.preparation.course math_marks reading.score writing.score
## 1                    none         72            72            74
## 2               completed         69            90            88
## 3                    none         90            95            93
## 4                    none         47            57            44
## 5                    none         76            78            75
## 6                    none         71            83            78

###mutate()

student <- student_data %>% mutate(total_marks = math.score + reading.score + writing.score) ## adding new column of total score
head(student)
##   gender ethnicity parental.level.of.education        lunch
## 1 female   group B           bachelor's degree     standard
## 2 female   group C                some college     standard
## 3 female   group B             master's degree     standard
## 4   male   group A          associate's degree free/reduced
## 5   male   group C                some college     standard
## 6 female   group B          associate's degree     standard
##   test.preparation.course math.score reading.score writing.score total_marks
## 1                    none         72            72            74         218
## 2               completed         69            90            88         247
## 3                    none         90            95            93         278
## 4                    none         47            57            44         148
## 5                    none         76            78            75         229
## 6                    none         71            83            78         232

###mutate()

student <- student_data %>% mutate(total_marks = math.score + reading.score + writing.score, avg = (math.score + reading.score + writing.score)/3)
head(student)
##   gender ethnicity parental.level.of.education        lunch
## 1 female   group B           bachelor's degree     standard
## 2 female   group C                some college     standard
## 3 female   group B             master's degree     standard
## 4   male   group A          associate's degree free/reduced
## 5   male   group C                some college     standard
## 6 female   group B          associate's degree     standard
##   test.preparation.course math.score reading.score writing.score total_marks
## 1                    none         72            72            74         218
## 2               completed         69            90            88         247
## 3                    none         90            95            93         278
## 4                    none         47            57            44         148
## 5                    none         76            78            75         229
## 6                    none         71            83            78         232
##        avg
## 1 72.66667
## 2 82.33333
## 3 92.66667
## 4 49.33333
## 5 76.33333
## 6 77.33333

###mutate()

student <- student_data %>% mutate(total_marks = math.score + reading.score + writing.score, avg = (math.score + reading.score + writing.score)/3, result = ifelse(((math.score + reading.score + writing.score)/3) >= 50, "Pass", "Fail"))
head(student)
##   gender ethnicity parental.level.of.education        lunch
## 1 female   group B           bachelor's degree     standard
## 2 female   group C                some college     standard
## 3 female   group B             master's degree     standard
## 4   male   group A          associate's degree free/reduced
## 5   male   group C                some college     standard
## 6 female   group B          associate's degree     standard
##   test.preparation.course math.score reading.score writing.score total_marks
## 1                    none         72            72            74         218
## 2               completed         69            90            88         247
## 3                    none         90            95            93         278
## 4                    none         47            57            44         148
## 5                    none         76            78            75         229
## 6                    none         71            83            78         232
##        avg result
## 1 72.66667   Pass
## 2 82.33333   Pass
## 3 92.66667   Pass
## 4 49.33333   Fail
## 5 76.33333   Pass
## 6 77.33333   Pass

GROUP_BY

student <- student_data %>% group_by(gender) %>% summarise(avg = mean(math.score))
head(student)
## # A tibble: 2 × 2
##   gender   avg
##   <chr>  <dbl>
## 1 female  63.6
## 2 male    68.7

Assignment 4: How to use trace(), recover() functions

using trace() function to find the average marks of mathematics subject and this is used to modify or inspect a function during execution.

math_marks <- student_data$math.score
myfun <- function(math_marks) {
  avg <- mean(math_marks)
  print(avg)
  
}
trace(myfun, quote(cat("Function is running\n")), print = FALSE)
## [1] "myfun"
myfun(math_marks)
## Function is running
## [1] 66.089

recover() function is used in debugging error environments, When an error occurs, it lets you inspect function calls step-by-step

enable recover function

options(error = recover)

###Example 1: Division by 0

f <- function(x) {
  y <- x / 0
  y
}
f(10)
## [1] Inf

Manual summary function to give statistics

marks <- student_data$math.score

my_summary <- function(math_marks) {

  # Sort the vector
  x <- sort(marks)

  n <- length(marks)

  # Minimum
  minimum <- marks[1]

  # Maximum
  maximum <- marks[n]

  # Mean
  total <- 0

  for(i in marks) {
    total <- total + i
  }

  mean_value <- total / n

  # Median
  if(n %% 2 == 1) {

    median_value <- marks[(n + 1) / 2]

  } else {

    median_value <- (x[n/2] + x[(n/2) + 1]) / 2
  }

  # First Quartile (Q1)
  q1_position <- (n + 1) / 4
  q1 <- x[round(q1_position)]

  # Third Quartile (Q3)
  q3_position <- 3 * (n + 1) / 4
  q3 <- x[round(q3_position)]

  # Display results
  cat("Minimum :", minimum, "\n")
  cat("1st Quartile :", q1, "\n")
  cat("Median :", median_value, "\n")
  cat("Mean :", mean_value, "\n")
  cat("3rd Quartile :", q3, "\n")
  cat("Maximum :", maximum, "\n")
}

my_summary(marks)
## Minimum : 72 
## 1st Quartile : 57 
## Median : 66 
## Mean : 66.089 
## 3rd Quartile : 77 
## Maximum : 77

Assignment 5: Manual Two-Sample t-Test Function

ttest_function <- function(x, y) {

  # Find lengths manually

  n1 <- 0

  for(i in x) {
    n1 <- n1 + 1
  }

  n2 <- 0

  for(i in y) {
    n2 <- n2 + 1
  }

  # Compute means manually

  sum1 <- 0

  for(i in x) {
    sum1 <- sum1 + i
  }

  mean1 <- sum1 / n1

  sum2 <- 0

  for(i in y) {
    sum2 <- sum2 + i
  }

  mean2 <- sum2 / n2

  # Compute variances manually

  ss1 <- 0

  for(i in x) {
    ss1 <- ss1 + (i - mean1)^2
  }

  var1 <- ss1 / (n1 - 1)

  ss2 <- 0

  for(i in y) {
    ss2 <- ss2 + (i - mean2)^2
  }

  var2 <- ss2 / (n2 - 1)

  # Pooled variance

  pooled_var <- (
    ((n1 - 1) * var1) +
    ((n2 - 1) * var2)
  ) / (n1 + n2 - 2)

  # Standard error

  se <- sqrt(
    pooled_var * ((1 / n1) + (1 / n2))
  )

  # t statistic

  t_value <- (mean1 - mean2) / se

  # Print results

  cat("Sample 1 Mean :", mean1, "\n")
  cat("Sample 2 Mean :", mean2, "\n")
  cat("Variance 1 :", var1, "\n")
  cat("Variance 2 :", var2, "\n")
  cat("t statistic :", t_value, "\n")
}

x <- student_data$math.score
y <- student_data$reading.score
ttest_function(x,y)
## Sample 1 Mean : 66.089 
## Sample 2 Mean : 69.169 
## Variance 1 : 229.919 
## Variance 2 : 213.1656 
## t statistic : -4.627084

Assignment 6: sapply() and vapply(), variants of lapply() that produces vectors,matrices, and arrays as Output, instead of lists map(), and mapply which iterate over multiple input data structures in parallel

###lapply(): Applies a function to each element and keeps output in a list

math <- student_data$math.score
reading <- student_data$reading.score
numbers <- list(math,reading)

lapply(numbers, sum)
## [[1]]
## [1] 66089
## 
## [[2]]
## [1] 69169

###sapply() : Like lapply(), but tries to simplify output.

math <- student_data$math.score
reading <- student_data$reading.score
numbers <- list(math,reading)
sapply(numbers, sum)
## [1] 66089 69169

###vapply(): You MUST specify expected output type

math <- student_data$math.score
reading <- student_data$reading.score
numbers <- list(math,reading)

vapply(numbers, sum, numeric(1))
## [1] 66089 69169

###mapply(): Applies function across multiple vectors at the same time.

math <- student_data$math.score
reading <- student_data$reading.score
mapply(sum, math)
##    [1]  72  69  90  47  76  71  88  40  64  38  58  40  65  78  50  69  88  18
##   [19]  46  54  66  65  44  69  74  73  69  67  70  62  69  63  56  40  97  81
##   [37]  74  50  75  57  55  58  53  59  50  65  55  66  57  82  53  77  53  88
##   [55]  71  33  82  52  58   0  79  39  62  69  59  67  45  60  61  39  58  63
##   [73]  41  61  49  44  30  80  61  62  47  49  50  72  42  73  76  71  58  73
##   [91]  65  27  71  43  79  78  65  63  58  65  79  68  85  60  98  58  87  66
##  [109]  52  70  77  62  54  51  99  84  75  78  51  55  79  91  88  63  83  87
##  [127]  72  65  82  51  89  53  87  75  74  58  51  70  59  71  76  59  42  57
##  [145]  88  22  88  73  68 100  62  77  59  54  62  70  66  60  61  66  82  75
##  [163]  49  52  81  96  53  58  68  67  72  94  79  63  43  81  46  71  52  97
##  [181]  62  46  50  65  45  65  80  62  48  77  66  76  62  77  69  61  59  55
##  [199]  45  78  67  65  69  57  59  74  82  81  74  58  80  35  42  60  87  84
##  [217]  83  34  66  61  56  87  55  86  52  45  72  57  68  88  76  46  67  92
##  [235]  83  80  63  64  54  84  73  80  56  59  75  85  89  58  65  68  47  71
##  [253]  60  80  54  62  64  78  70  65  64  79  44  99  76  59  63  69  88  71
##  [271]  69  58  47  65  88  83  85  59  65  73  53  45  73  70  37  81  97  67
##  [289]  88  77  76  86  63  65  78  67  46  71  40  90  81  56  67  80  74  69
##  [307]  99  51  53  49  73  66  67  68  59  71  77  83  63  56  67  75  71  43
##  [325]  41  82  61  28  82  41  71  47  62  90  83  61  76  49  24  35  58  61
##  [343]  69  67  79  72  62  77  75  87  52  66  63  46  59  61  63  42  59  80
##  [361]  58  85  52  27  59  49  69  61  44  73  84  45  74  82  59  46  80  85
##  [379]  71  66  80  87  79  38  38  67  64  57  62  73  73  77  76  57  65  48
##  [397]  50  85  74  60  59  53  49  88  54  63  65  82  52  87  70  84  71  63
##  [415]  51  84  71  74  68  57  82  57  47  59  41  62  86  69  65  68  64  61
##  [433]  61  47  73  50  75  75  70  89  67  78  59  73  79  67  69  86  47  81
##  [451]  64 100  65  65  53  37  79  53 100  72  53  54  71  77  75  84  26  72
##  [469]  77  91  83  63  68  59  90  71  76  80  55  76  73  52  68  59  49  70
##  [487]  61  60  64  79  65  64  83  81  54  68  54  59  66  76  74  94  63  95
##  [505]  40  82  68  55  79  86  76  64  62  54  77  76  74  66  66  67  71  91
##  [523]  69  54  53  68  56  36  29  62  68  47  62  79  73  66  51  51  85  97
##  [541]  75  79  81  82  64  78  92  72  62  79  79  87  40  77  53  32  55  61
##  [559]  53  73  74  63  96  63  48  48  92  61  63  68  71  91  53  50  74  40
##  [577]  61  81  48  53  81  77  63  73  69  65  55  44  54  48  58  71  68  74
##  [595]  92  56  30  53  69  65  54  29  76  60  84  75  85  40  61  58  69  58
##  [613]  94  65  82  60  37  88  95  65  35  62  58 100  61 100  69  61  49  44
##  [631]  67  79  66  75  84  71  67  80  86  76  41  74  72  74  70  65  59  64
##  [649]  50  69  51  68  85  65  73  62  77  69  43  90  74  73  55  65  80  50
##  [667]  63  77  73  81  66  52  69  65  69  50  73  70  81  63  67  60  62  29
##  [685]  62  94  85  77  53  93  49  73  66  77  49  79  75  59  57  66  79  57
##  [703]  87  63  59  62  46  66  89  42  93  80  98  81  60  76  73  96  76  91
##  [721]  62  55  74  50  47  81  65  68  73  53  68  55  87  55  53  67  92  53
##  [739]  81  61  80  37  81  59  55  72  69  69  50  87  71  68  79  77  58  84
##  [757]  55  70  52  69  53  48  78  62  60  74  58  76  68  58  52  75  52  62
##  [775]  66  49  66  35  72  94  46  77  76  52  91  32  72  19  68  52  48  60
##  [793]  66  89  42  57  70  70  69  52  67  76  87  82  73  75  64  41  90  59
##  [811]  51  45  54  87  72  94  45  61  60  77  85  78  49  71  48  62  56  65
##  [829]  69  68  61  74  64  77  58  60  73  75  58  66  39  64  23  74  40  90
##  [847]  91  64  59  80  71  61  87  82  62  97  75  65  52  87  53  81  39  71
##  [865]  97  82  59  61  78  49  59  70  82  90  43  80  81  57  59  64  63  71
##  [883]  64  55  51  62  93  54  69  44  86  85  50  88  59  32  36  63  67  65
##  [901]  85  73  34  93  67  88  57  79  67  70  50  69  52  47  46  68 100  44
##  [919]  57  91  69  35  72  54  74  74  64  65  46  48  67  62  61  70  98  70
##  [937]  67  57  85  77  72  78  81  61  58  54  82  49  49  57  94  75  74  58
##  [955]  62  72  84  92  45  75  56  48 100  65  72  62  66  63  68  75  89  78
##  [973]  53  49  54  64  60  62  55  91   8  81  79  78  74  57  40  81  44  67
##  [991]  86  65  55  62  63  88  62  59  68  77

###map():

library(purrr)
math <- student_data$math.score
reading <- student_data$reading.score

numbers <- list(math, reading)

map(numbers, sum)
## [[1]]
## [1] 66089
## 
## [[2]]
## [1] 69169
map_dbl(numbers, mean)
## [1] 66.089 69.169