# Load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(dplyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.3 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(ggplot2)
# Read the semicolon-delimited dataset into a tibble and list its columns.
# NOTE(review): the path is absolute and machine-specific, so it only resolves
# on the original author's computer. The name `mpg` also masks ggplot2's
# built-in `mpg` dataset for as long as ggplot2 is attached.
mpg <- read_delim(
  "C:/Users/kondo/OneDrive/Desktop/INTRO to Statistics and R/Data Set and work/data.csv",
  delim = ";",
  show_col_types = FALSE
)
glimpse(mpg)
## Rows: 4,424
## Columns: 37
## $ `Marital status` <dbl> 1, 1, 1, 1, 2, 2, 1, …
## $ `Application mode` <dbl> 17, 15, 1, 17, 39, 39…
## $ `Application order` <dbl> 5, 1, 5, 2, 1, 1, 1, …
## $ Course <dbl> 171, 9254, 9070, 9773…
## $ `Daytime/evening attendance\t` <dbl> 1, 1, 1, 1, 0, 0, 1, …
## $ `Previous qualification` <dbl> 1, 1, 1, 1, 1, 19, 1,…
## $ `Previous qualification (grade)` <dbl> 122.0, 160.0, 122.0, …
## $ Nacionality <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ `Mother's qualification` <dbl> 19, 1, 37, 38, 37, 37…
## $ `Father's qualification` <dbl> 12, 3, 37, 37, 38, 37…
## $ `Mother's occupation` <dbl> 5, 3, 9, 5, 9, 9, 7, …
## $ `Father's occupation` <dbl> 9, 3, 9, 3, 9, 7, 10,…
## $ `Admission grade` <dbl> 127.3, 142.5, 124.8, …
## $ Displaced <dbl> 1, 1, 1, 1, 0, 0, 1, …
## $ `Educational special needs` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ Debtor <dbl> 0, 0, 0, 0, 0, 1, 0, …
## $ `Tuition fees up to date` <dbl> 1, 0, 0, 1, 1, 1, 1, …
## $ Gender <dbl> 1, 1, 1, 0, 0, 1, 0, …
## $ `Scholarship holder` <dbl> 0, 0, 0, 0, 0, 0, 1, …
## $ `Age at enrollment` <dbl> 20, 19, 19, 20, 45, 5…
## $ International <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `Curricular units 1st sem (credited)` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `Curricular units 1st sem (enrolled)` <dbl> 0, 6, 6, 6, 6, 5, 7, …
## $ `Curricular units 1st sem (evaluations)` <dbl> 0, 6, 0, 8, 9, 10, 9,…
## $ `Curricular units 1st sem (approved)` <dbl> 0, 6, 0, 6, 5, 5, 7, …
## $ `Curricular units 1st sem (grade)` <dbl> 0.00000, 14.00000, 0.…
## $ `Curricular units 1st sem (without evaluations)` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `Curricular units 2nd sem (credited)` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `Curricular units 2nd sem (enrolled)` <dbl> 0, 6, 6, 6, 6, 5, 8, …
## $ `Curricular units 2nd sem (evaluations)` <dbl> 0, 6, 0, 10, 6, 17, 8…
## $ `Curricular units 2nd sem (approved)` <dbl> 0, 6, 0, 5, 6, 5, 8, …
## $ `Curricular units 2nd sem (grade)` <dbl> 0.00000, 13.66667, 0.…
## $ `Curricular units 2nd sem (without evaluations)` <dbl> 0, 0, 0, 0, 0, 5, 0, …
## $ `Unemployment rate` <dbl> 10.8, 13.9, 10.8, 9.4…
## $ `Inflation rate` <dbl> 1.4, -0.3, 1.4, -0.8,…
## $ GDP <dbl> 1.74, 0.79, 1.74, -3.…
## $ Target <chr> "Dropout", "Graduate"…
# Draw `num_subsamples` random subsamples of `mpg`, each holding
# `sample_percentage` of the original row count, sampled WITH replacement, and
# collect them in `subsample_list`.
# NOTE(review): no seed is set, so every run draws different rows. Consider
# calling set.seed() before this section if the written conclusions must stay
# in sync with the printed subsamples.

# Container for plots; not filled in this section.
histograms_and_boxplots <- list()

# Number of subsamples to draw
num_subsamples <- 6
# Fraction of the original rows to include in each subsample
sample_percentage <- 0.5
# Number of rows in the original dataset
num_rows <- nrow(mpg)
# Number of rows in each subsample
sample_size <- round(num_rows * sample_percentage)

# Columns kept in every subsample: columns 4-7 plus the last two columns.
# Computed once because it does not depend on the loop index.
columns_to_select <- c(4:7, (ncol(mpg) - 1):ncol(mpg))

# Preallocate the result list instead of growing it inside the loop.
subsample_list <- vector("list", num_subsamples)

# seq_len() yields an empty sequence when num_subsamples is 0, whereas
# 1:num_subsamples would iterate over c(1, 0) and fail on `[[0]]`.
for (i in seq_len(num_subsamples)) {
  # sample.int(n, ...) draws from 1..n exactly like sample(1:n, ...), without
  # building the index vector first.
  sample_indices <- sample.int(num_rows, size = sample_size, replace = TRUE)
  subsample_df <- mpg[sample_indices, columns_to_select]
  subsample_list[[i]] <- subsample_df
}
# Pull each of the six subsamples out of the list into its own variable.
subsample_1 <- subsample_list[[1L]]
subsample_2 <- subsample_list[[2L]]
subsample_3 <- subsample_list[[3L]]
subsample_4 <- subsample_list[[4L]]
subsample_5 <- subsample_list[[5L]]
subsample_6 <- subsample_list[[6L]]

# Show the first six rows of the first subsample.
head(subsample_1, n = 6L)
## # A tibble: 6 × 6
## Course Daytime/evening attenda…¹ Previous qualificati…² Previous qualificati…³
## <dbl> <dbl> <dbl> <dbl>
## 1 9500 1 1 120
## 2 171 1 1 151
## 3 9500 1 1 122
## 4 9500 1 1 136
## 5 9500 1 6 133
## 6 9238 1 1 127
## # ℹ abbreviated names: ¹`Daytime/evening attendance\t`,
## # ²`Previous qualification`, ³`Previous qualification (grade)`
## # ℹ 2 more variables: GDP <dbl>, Target <chr>
# Show the first six rows of the second subsample.
head(subsample_2, n = 6L)
## # A tibble: 6 × 6
## Course Daytime/evening attenda…¹ Previous qualificati…² Previous qualificati…³
## <dbl> <dbl> <dbl> <dbl>
## 1 9147 1 1 132
## 2 9130 1 1 152
## 3 9238 1 1 133
## 4 9500 1 1 102
## 5 9147 1 1 120
## 6 9853 1 1 133.
## # ℹ abbreviated names: ¹`Daytime/evening attendance\t`,
## # ²`Previous qualification`, ³`Previous qualification (grade)`
## # ℹ 2 more variables: GDP <dbl>, Target <chr>
# Show the first six rows of the third subsample.
head(subsample_3, n = 6L)
## # A tibble: 6 × 6
## Course Daytime/evening attenda…¹ Previous qualificati…² Previous qualificati…³
## <dbl> <dbl> <dbl> <dbl>
## 1 9003 1 39 150
## 2 9991 0 39 130
## 3 9853 1 1 120
## 4 9147 1 1 156
## 5 9070 1 1 133.
## 6 9238 1 1 138
## # ℹ abbreviated names: ¹`Daytime/evening attendance\t`,
## # ²`Previous qualification`, ³`Previous qualification (grade)`
## # ℹ 2 more variables: GDP <dbl>, Target <chr>
# Show the first six rows of the fourth subsample.
head(subsample_4, n = 6L)
## # A tibble: 6 × 6
## Course Daytime/evening attenda…¹ Previous qualificati…² Previous qualificati…³
## <dbl> <dbl> <dbl> <dbl>
## 1 9085 1 1 140
## 2 9991 0 1 150
## 3 9085 1 1 184.
## 4 9119 1 1 130
## 5 9853 1 1 168
## 6 9670 1 1 121
## # ℹ abbreviated names: ¹`Daytime/evening attendance\t`,
## # ²`Previous qualification`, ³`Previous qualification (grade)`
## # ℹ 2 more variables: GDP <dbl>, Target <chr>
# Show the first six rows of the fifth subsample.
head(subsample_5, n = 6L)
## # A tibble: 6 × 6
## Course Daytime/evening attenda…¹ Previous qualificati…² Previous qualificati…³
## <dbl> <dbl> <dbl> <dbl>
## 1 9500 1 1 150
## 2 9500 1 1 132
## 3 9003 1 39 160
## 4 171 1 1 159
## 5 9003 1 39 140
## 6 9773 1 1 122
## # ℹ abbreviated names: ¹`Daytime/evening attendance\t`,
## # ²`Previous qualification`, ³`Previous qualification (grade)`
## # ℹ 2 more variables: GDP <dbl>, Target <chr>
# Show the first six rows of the sixth subsample.
head(subsample_6, n = 6L)
## # A tibble: 6 × 6
## Course Daytime/evening attenda…¹ Previous qualificati…² Previous qualificati…³
## <dbl> <dbl> <dbl> <dbl>
## 1 9500 1 1 139
## 2 9500 1 3 120
## 3 9556 1 19 133.
## 4 9773 1 1 128
## 5 9853 1 1 133.
## 6 9500 1 1 160
## # ℹ abbreviated names: ¹`Daytime/evening attendance\t`,
## # ²`Previous qualification`, ³`Previous qualification (grade)`
## # ℹ 2 more variables: GDP <dbl>, Target <chr>
Scrutinizing the subsamples above, we can draw the following conclusions: 1) In Sub-Sample 1, a GDP value of -3.12 might be considered an anomaly because of its extreme negative deviation. 2) In Sub-Sample 3, a GDP value of 3.51 could be considered an anomaly because of its significant positive deviation. 3) A Course ID is present in every sub-sample, which allows the course associated with each row of data to be identified. 4) Daytime/Evening Attendance, Previous Qualification, Previous Qualification (Grade), and GDP are attributes common to all sub-samples. 5) These attributes vary within each sub-sample, and the specific values are not consistent from one sub-sample to the next. Each sub-sample represents different courses or scenarios. 6) The differences among sub-samples suggest that each sub-sample represents a different context or scenario related to courses. Conclusions drawn from one sub-sample may not be generalizable to the others. 7) The identification of anomalies depends on the distribution of data within each sub-sample: what is considered an anomaly in one sub-sample might not be considered one in another.