library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

For this week’s data dive, I would be using the dataset_1 which is a reformed version my my original data set and is same as what I used in last two data dives.

dataset <-read_delim("C:/Users/MSKR/MASTER'S_ADS/STATISTICS_SEM1/DATA_SET_1.csv", delim = ",")
## Rows: 4424 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Target
## dbl (36): Marital status, Application mode, Application order, Course, Dayti...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dataset_1<-mutate(dataset, marital_status = ifelse(dataset$`Marital status` == 1, "single",
                    ifelse(`Marital status` == 2, "married",
                    ifelse(`Marital status` == 3, "widower",
                    ifelse(`Marital status` == 4, "divorced",
                    ifelse(`Marital status` == 5, "facto union",
                    ifelse(`Marital status` == 6, "legally seperated", "no")))))))
dataset_1<-mutate(dataset_1,gender=ifelse(Gender==1,"male","female")) 
summary(dataset_1)
##  Marital status  Application mode Application order     Course    
##  Min.   :1.000   Min.   : 1.00    Min.   :0.000     Min.   :  33  
##  1st Qu.:1.000   1st Qu.: 1.00    1st Qu.:1.000     1st Qu.:9085  
##  Median :1.000   Median :17.00    Median :1.000     Median :9238  
##  Mean   :1.179   Mean   :18.67    Mean   :1.728     Mean   :8857  
##  3rd Qu.:1.000   3rd Qu.:39.00    3rd Qu.:2.000     3rd Qu.:9556  
##  Max.   :6.000   Max.   :57.00    Max.   :9.000     Max.   :9991  
##  Daytime/evening attendance\t Previous qualification
##  Min.   :0.0000               Min.   : 1.000        
##  1st Qu.:1.0000               1st Qu.: 1.000        
##  Median :1.0000               Median : 1.000        
##  Mean   :0.8908               Mean   : 4.578        
##  3rd Qu.:1.0000               3rd Qu.: 1.000        
##  Max.   :1.0000               Max.   :43.000        
##  Previous qualification (grade)  Nacionality      Mother's qualification
##  Min.   : 95.0                  Min.   :  1.000   Min.   : 1.00         
##  1st Qu.:125.0                  1st Qu.:  1.000   1st Qu.: 2.00         
##  Median :133.1                  Median :  1.000   Median :19.00         
##  Mean   :132.6                  Mean   :  1.873   Mean   :19.56         
##  3rd Qu.:140.0                  3rd Qu.:  1.000   3rd Qu.:37.00         
##  Max.   :190.0                  Max.   :109.000   Max.   :44.00         
##  Father's qualification Mother's occupation Father's occupation Admission grade
##  Min.   : 1.00          Min.   :  0.00      Min.   :  0.00      Min.   : 95.0  
##  1st Qu.: 3.00          1st Qu.:  4.00      1st Qu.:  4.00      1st Qu.:117.9  
##  Median :19.00          Median :  5.00      Median :  7.00      Median :126.1  
##  Mean   :22.28          Mean   : 10.96      Mean   : 11.03      Mean   :127.0  
##  3rd Qu.:37.00          3rd Qu.:  9.00      3rd Qu.:  9.00      3rd Qu.:134.8  
##  Max.   :44.00          Max.   :194.00      Max.   :195.00      Max.   :190.0  
##    Displaced      Educational special needs     Debtor      
##  Min.   :0.0000   Min.   :0.00000           Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00000           1st Qu.:0.0000  
##  Median :1.0000   Median :0.00000           Median :0.0000  
##  Mean   :0.5484   Mean   :0.01153           Mean   :0.1137  
##  3rd Qu.:1.0000   3rd Qu.:0.00000           3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.00000           Max.   :1.0000  
##  Tuition fees up to date     Gender       Scholarship holder Age at enrollment
##  Min.   :0.0000          Min.   :0.0000   Min.   :0.0000     Min.   :17.00    
##  1st Qu.:1.0000          1st Qu.:0.0000   1st Qu.:0.0000     1st Qu.:19.00    
##  Median :1.0000          Median :0.0000   Median :0.0000     Median :20.00    
##  Mean   :0.8807          Mean   :0.3517   Mean   :0.2484     Mean   :23.27    
##  3rd Qu.:1.0000          3rd Qu.:1.0000   3rd Qu.:0.0000     3rd Qu.:25.00    
##  Max.   :1.0000          Max.   :1.0000   Max.   :1.0000     Max.   :70.00    
##  International     Curricular units 1st sem (credited)
##  Min.   :0.00000   Min.   : 0.00                      
##  1st Qu.:0.00000   1st Qu.: 0.00                      
##  Median :0.00000   Median : 0.00                      
##  Mean   :0.02486   Mean   : 0.71                      
##  3rd Qu.:0.00000   3rd Qu.: 0.00                      
##  Max.   :1.00000   Max.   :20.00                      
##  Curricular units 1st sem (enrolled) Curricular units 1st sem (evaluations)
##  Min.   : 0.000                      Min.   : 0.000                        
##  1st Qu.: 5.000                      1st Qu.: 6.000                        
##  Median : 6.000                      Median : 8.000                        
##  Mean   : 6.271                      Mean   : 8.299                        
##  3rd Qu.: 7.000                      3rd Qu.:10.000                        
##  Max.   :26.000                      Max.   :45.000                        
##  Curricular units 1st sem (approved) Curricular units 1st sem (grade)
##  Min.   : 0.000                      Min.   : 0.00                   
##  1st Qu.: 3.000                      1st Qu.:11.00                   
##  Median : 5.000                      Median :12.29                   
##  Mean   : 4.707                      Mean   :10.64                   
##  3rd Qu.: 6.000                      3rd Qu.:13.40                   
##  Max.   :26.000                      Max.   :18.88                   
##  Curricular units 1st sem (without evaluations)
##  Min.   : 0.0000                               
##  1st Qu.: 0.0000                               
##  Median : 0.0000                               
##  Mean   : 0.1377                               
##  3rd Qu.: 0.0000                               
##  Max.   :12.0000                               
##  Curricular units 2nd sem (credited) Curricular units 2nd sem (enrolled)
##  Min.   : 0.0000                     Min.   : 0.000                     
##  1st Qu.: 0.0000                     1st Qu.: 5.000                     
##  Median : 0.0000                     Median : 6.000                     
##  Mean   : 0.5418                     Mean   : 6.232                     
##  3rd Qu.: 0.0000                     3rd Qu.: 7.000                     
##  Max.   :19.0000                     Max.   :23.000                     
##  Curricular units 2nd sem (evaluations) Curricular units 2nd sem (approved)
##  Min.   : 0.000                         Min.   : 0.000                     
##  1st Qu.: 6.000                         1st Qu.: 2.000                     
##  Median : 8.000                         Median : 5.000                     
##  Mean   : 8.063                         Mean   : 4.436                     
##  3rd Qu.:10.000                         3rd Qu.: 6.000                     
##  Max.   :33.000                         Max.   :20.000                     
##  Curricular units 2nd sem (grade)
##  Min.   : 0.00                   
##  1st Qu.:10.75                   
##  Median :12.20                   
##  Mean   :10.23                   
##  3rd Qu.:13.33                   
##  Max.   :18.57                   
##  Curricular units 2nd sem (without evaluations) Unemployment rate
##  Min.   : 0.0000                                Min.   : 7.60    
##  1st Qu.: 0.0000                                1st Qu.: 9.40    
##  Median : 0.0000                                Median :11.10    
##  Mean   : 0.1503                                Mean   :11.57    
##  3rd Qu.: 0.0000                                3rd Qu.:13.90    
##  Max.   :12.0000                                Max.   :16.20    
##  Inflation rate        GDP               Target          marital_status    
##  Min.   :-0.800   Min.   :-4.060000   Length:4424        Length:4424       
##  1st Qu.: 0.300   1st Qu.:-1.700000   Class :character   Class :character  
##  Median : 1.400   Median : 0.320000   Mode  :character   Mode  :character  
##  Mean   : 1.228   Mean   : 0.001969                                        
##  3rd Qu.: 2.600   3rd Qu.: 1.790000                                        
##  Max.   : 3.700   Max.   : 3.510000                                        
##     gender         
##  Length:4424       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Let’s create 5 subsets from the total population of data from dataset_1.

sample_size<-nrow(dataset_1)%/%2

df1<-sample_n(dataset_1,sample_size,replace=TRUE)
df1
## # A tibble: 2,212 × 39
##    `Marital status` `Application mode` `Application order` Course
##               <dbl>              <dbl>               <dbl>  <dbl>
##  1                1                  1                   1   9238
##  2                1                 17                   1   9670
##  3                1                  1                   4   9773
##  4                1                  1                   2   9147
##  5                1                  1                   3   9238
##  6                1                 43                   1   9991
##  7                2                 39                   1   9556
##  8                1                 17                   2   9119
##  9                1                  1                   4   9500
## 10                1                  1                   1   9085
## # ℹ 2,202 more rows
## # ℹ 35 more variables: `Daytime/evening attendance\t` <dbl>,
## #   `Previous qualification` <dbl>, `Previous qualification (grade)` <dbl>,
## #   Nacionality <dbl>, `Mother's qualification` <dbl>,
## #   `Father's qualification` <dbl>, `Mother's occupation` <dbl>,
## #   `Father's occupation` <dbl>, `Admission grade` <dbl>, Displaced <dbl>,
## #   `Educational special needs` <dbl>, Debtor <dbl>, …
df2<-sample_n(dataset_1,sample_size,replace=TRUE)
df2
## # A tibble: 2,212 × 39
##    `Marital status` `Application mode` `Application order` Course
##               <dbl>              <dbl>               <dbl>  <dbl>
##  1                1                  1                   1   9500
##  2                1                 44                   1   9238
##  3                1                 39                   1   9556
##  4                1                  1                   1    171
##  5                1                  1                   1    171
##  6                1                  1                   1   9147
##  7                1                  1                   5   9500
##  8                1                 17                   2   9556
##  9                1                 17                   2   8014
## 10                1                 39                   1   9119
## # ℹ 2,202 more rows
## # ℹ 35 more variables: `Daytime/evening attendance\t` <dbl>,
## #   `Previous qualification` <dbl>, `Previous qualification (grade)` <dbl>,
## #   Nacionality <dbl>, `Mother's qualification` <dbl>,
## #   `Father's qualification` <dbl>, `Mother's occupation` <dbl>,
## #   `Father's occupation` <dbl>, `Admission grade` <dbl>, Displaced <dbl>,
## #   `Educational special needs` <dbl>, Debtor <dbl>, …
df3<-sample_n(dataset_1,sample_size,replace=TRUE)
df3
## # A tibble: 2,212 × 39
##    `Marital status` `Application mode` `Application order` Course
##               <dbl>              <dbl>               <dbl>  <dbl>
##  1                2                 39                   1   8014
##  2                1                 39                   1   9991
##  3                1                 18                   1   9500
##  4                1                  1                   5   9500
##  5                1                 17                   2   9500
##  6                1                 18                   1   9500
##  7                1                 39                   1   9130
##  8                1                 17                   3   9147
##  9                1                  1                   1   9238
## 10                1                 44                   1   9003
## # ℹ 2,202 more rows
## # ℹ 35 more variables: `Daytime/evening attendance\t` <dbl>,
## #   `Previous qualification` <dbl>, `Previous qualification (grade)` <dbl>,
## #   Nacionality <dbl>, `Mother's qualification` <dbl>,
## #   `Father's qualification` <dbl>, `Mother's occupation` <dbl>,
## #   `Father's occupation` <dbl>, `Admission grade` <dbl>, Displaced <dbl>,
## #   `Educational special needs` <dbl>, Debtor <dbl>, …
df4<-sample_n(dataset_1,sample_size,replace=TRUE)
df4
## # A tibble: 2,212 × 39
##    `Marital status` `Application mode` `Application order` Course
##               <dbl>              <dbl>               <dbl>  <dbl>
##  1                1                  1                   3   9773
##  2                1                  1                   1   9500
##  3                1                 43                   1   9991
##  4                1                 44                   1   9003
##  5                1                 39                   1   9670
##  6                1                  1                   1   9500
##  7                1                  7                   1   9500
##  8                1                 43                   1   9500
##  9                1                  1                   5   9070
## 10                1                 39                   1   9500
## # ℹ 2,202 more rows
## # ℹ 35 more variables: `Daytime/evening attendance\t` <dbl>,
## #   `Previous qualification` <dbl>, `Previous qualification (grade)` <dbl>,
## #   Nacionality <dbl>, `Mother's qualification` <dbl>,
## #   `Father's qualification` <dbl>, `Mother's occupation` <dbl>,
## #   `Father's occupation` <dbl>, `Admission grade` <dbl>, Displaced <dbl>,
## #   `Educational special needs` <dbl>, Debtor <dbl>, …
df5<-sample_n(dataset_1,sample_size,replace=TRUE)
df5
## # A tibble: 2,212 × 39
##    `Marital status` `Application mode` `Application order` Course
##               <dbl>              <dbl>               <dbl>  <dbl>
##  1                1                 39                   1   9238
##  2                1                  1                   2   9773
##  3                1                 43                   1   9238
##  4                1                  1                   1   9500
##  5                1                 39                   1   9003
##  6                1                 17                   4   9500
##  7                1                 17                   4   9147
##  8                1                 44                   1    171
##  9                1                 17                   1   9254
## 10                1                  1                   1   9254
## # ℹ 2,202 more rows
## # ℹ 35 more variables: `Daytime/evening attendance\t` <dbl>,
## #   `Previous qualification` <dbl>, `Previous qualification (grade)` <dbl>,
## #   Nacionality <dbl>, `Mother's qualification` <dbl>,
## #   `Father's qualification` <dbl>, `Mother's occupation` <dbl>,
## #   `Father's occupation` <dbl>, `Admission grade` <dbl>, Displaced <dbl>,
## #   `Educational special needs` <dbl>, Debtor <dbl>, …

All five sub samples have equal number of records, i.e.,2212 observations with 40 variables in each respectively.

dataset_1|>
  group_by(gender)|>
  summarise(count=n(),
            mean_1stsemGrade=mean(`Curricular units 1st sem (grade)`),
            mean_AdmGrade=mean(`Admission grade`)
            )
## # A tibble: 2 × 4
##   gender count mean_1stsemGrade mean_AdmGrade
##   <chr>  <int>            <dbl>         <dbl>
## 1 female  2868            11.3           127.
## 2 male    1556             9.40          127.
df1|>
  group_by(gender)|>
  summarise(
    count = n(),
    mean_1stsemGrade = mean(`Curricular units 1st sem (grade)`,),
    mean_AdmGrade = mean(`Admission grade`)
          )
## # A tibble: 2 × 4
##   gender count mean_1stsemGrade mean_AdmGrade
##   <chr>  <int>            <dbl>         <dbl>
## 1 female  1428            11.2           128.
## 2 male     784             9.53          128.
df2|>
  group_by(gender)|>
  summarise(
    count = n(),
    mean_1stsemGrade = mean(`Curricular units 1st sem (grade)`,),
    mean_AdmGrade = mean(`Admission grade`)
          )
## # A tibble: 2 × 4
##   gender count mean_1stsemGrade mean_AdmGrade
##   <chr>  <int>            <dbl>         <dbl>
## 1 female  1443            11.2           127.
## 2 male     769             9.55          128.
df3|>
  group_by(gender)|>
  summarise(
    count = n(),
    mean_1stsemGrade = mean(`Curricular units 1st sem (grade)`,),
    mean_AdmGrade = mean(`Admission grade`)
          )
## # A tibble: 2 × 4
##   gender count mean_1stsemGrade mean_AdmGrade
##   <chr>  <int>            <dbl>         <dbl>
## 1 female  1404            11.3           127.
## 2 male     808             9.63          128.
df4|>
  group_by(gender)|>
  summarise(
    count = n(),
    mean_1stsemGrade = mean(`Curricular units 1st sem (grade)`,),
    mean_AdmGrade = mean(`Admission grade`)
          )
## # A tibble: 2 × 4
##   gender count mean_1stsemGrade mean_AdmGrade
##   <chr>  <int>            <dbl>         <dbl>
## 1 female  1425             11.4          128.
## 2 male     787             10.0          128.
df5|>
  group_by(gender)|>
  summarise(
    count = n(),
    mean_1stsemGrade = mean(`Curricular units 1st sem (grade)`),
    mean_AdmGrade = mean(`Admission grade`)
          )
## # A tibble: 2 × 4
##   gender count mean_1stsemGrade mean_AdmGrade
##   <chr>  <int>            <dbl>         <dbl>
## 1 female  1461            11.4           127.
## 2 male     751             9.55          128.

This strongly gives us confidence that using analytic metrics on a random sample of the population, we can draw insights which will equally apply to the entire population of the data set.

num_simulations <- 1
simulated_means <- numeric(num_simulations)
simulated_means[1] <- mean(df1$`Curricular units 1st sem (grade)`, na.rm = TRUE)
simulated_means[2] <- mean(df2$`Curricular units 1st sem (grade)`, na.rm = TRUE)
simulated_means[3] <- mean(df3$`Curricular units 1st sem (grade)`, na.rm = TRUE)
simulated_means[4] <- mean(df4$`Curricular units 1st sem (grade)`, na.rm = TRUE)
simulated_means[5] <- mean(df5$`Curricular units 1st sem (grade)`, na.rm = TRUE)

# Visualizing the distribution of simulated means
ggplot(data.frame(simulated_means), aes(x = simulated_means)) +
  geom_histogram(bins = 30, fill = "blue", alpha = 0.8) +
  labs(title = "Distribution of Simulated Means", x = "Mean Score", y = "Frequency")

simulated_means
## [1] 10.63321 10.62437 10.67854 10.92841 10.74548
sd(simulated_means)
## [1] 0.1249628
mean(simulated_means)
## [1] 10.722