Summary

Goal:

Importing the CSV data into a data frame:

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the comma-delimited data set into a tibble.
dataset <- read_delim(
  "C:/Users/MSKR/MASTERS_ADS/STATISTICS_SEM1/DATA_SET_1.csv",
  delim = ","
)
## Rows: 4424 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Target
## dbl (36): Marital status, Application mode, Application order, Course, Dayti...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Investigations:

  1. One interesting aspect of the data is that it covers students with different marital statuses, so we can look into how their academic outcome (Target) classification varies across them.
# Add a readable `marital_status` label decoded from the numeric
# `Marital status` column of the imported data.
# Columns are referenced bare (data masking) rather than through `dataset$`,
# so the expression always uses the rows mutate() is actually working on.
dataset_1 <- dataset |>
  mutate(
    marital_status = case_when(
      # Keep missing codes missing, exactly as the nested ifelse() chain did.
      is.na(`Marital status`) ~ NA_character_,
      `Marital status` == 1 ~ "single",
      `Marital status` == 2 ~ "married",
      `Marital status` == 3 ~ "widower",
      `Marital status` == 4 ~ "divorced",
      `Marital status` == 5 ~ "facto union",
      `Marital status` == 6 ~ "legally seperated",
      # Any other (non-missing) code falls back to "no".
      .default = "no"
    )
  )
# Number of students for every marital status / Target combination.
dg <- dataset_1 |>
  group_by(marital_status, Target) |>
  summarise(num_std = n())
## `summarise()` has grouped output by 'marital_status'. You can override using
## the `.groups` argument.
dg
## # A tibble: 18 × 3
## # Groups:   marital_status [6]
##    marital_status    Target   num_std
##    <chr>             <chr>      <int>
##  1 divorced          Dropout       42
##  2 divorced          Enrolled      16
##  3 divorced          Graduate      33
##  4 facto union       Dropout       11
##  5 facto union       Enrolled       3
##  6 facto union       Graduate      11
##  7 legally seperated Dropout        4
##  8 legally seperated Enrolled       1
##  9 legally seperated Graduate       1
## 10 married           Dropout      179
## 11 married           Enrolled      52
## 12 married           Graduate     148
## 13 single            Dropout     1184
## 14 single            Enrolled     720
## 15 single            Graduate    2015
## 16 widower           Dropout        1
## 17 widower           Enrolled       2
## 18 widower           Graduate       1
# Grouped bar chart built from `dg`: one bar per Target within each marital
# status, with bar height equal to the student count (`num_std`).
p1 <- ggplot(dg, aes(x = marital_status, y = num_std, fill = Target)) +
  geom_col(position = position_dodge()) +
  labs(
    title = "Marital status vs Target",
    x = "marital_status",
    y = "Frequency"
  ) +
  theme_minimal()
p1

  2. Since there are students from different nationalities and marital statuses, we can visualize how unemployment rates vary across each category of students.

    # Box plot of the `Unemployment rate` column for each marital status label.
    p <- ggplot(dataset_1, aes(x = marital_status, y = `Unemployment rate`)) +
      geom_boxplot() +
      labs(title = "Unemployment Rate") +
      theme_minimal()

    p

Future Plan:

Initial findings:

  1. It seems that students with higher admission grades are more likely to graduate:

    # Compare the distribution of admission grades across the Target groups.
    # A box plot is used because geom_bar(stat = "identity") on the raw rows
    # stacks every student's grade: bar height was the *sum* of grades per
    # group, which mostly reflects how many students each group contains
    # rather than how high their grades are.
    p2 <- dataset_1 |>
      ggplot(aes(x = Target, y = `Admission grade`)) +
      geom_boxplot() +
      labs(
        title = "Admission grades trend",
        x = "Target",
        y = "Grades"
      )
    p2

  2. Students attending day classes appear to perform better academically than those attending in the evening.

    # Label each student as attending "day" or "evening" classes
    # (code 1 -> "day", any other non-missing code -> "evening").
    # The column is referenced bare inside mutate() instead of via `dataset_1$`;
    # its name keeps the trailing whitespace present in the imported header.
    dataset_1 <- dataset_1 |>
      mutate(
        day_eve_class = if_else(
          `Daytime/evening attendance    ` == 1, "day", "evening"
        )
      )
    
    # Number of students for every Target / attendance-type combination.
    dd <- dataset_1 |>
      group_by(Target, day_eve_class) |>
      summarise(freq = n())
    ## `summarise()` has grouped output by 'Target'. You can override using the
    ## `.groups` argument.
    dd
    ## # A tibble: 6 × 3
    ## # Groups:   Target [3]
    ##   Target   day_eve_class  freq
    ##   <chr>    <chr>         <int>
    ## 1 Dropout  day            1214
    ## 2 Dropout  evening         207
    ## 3 Enrolled day             719
    ## 4 Enrolled evening          75
    ## 5 Graduate day            2008
    ## 6 Graduate evening         201
    # Grouped bars built from `dd`: student counts per attendance type,
    # split by Target.
    p3 <- ggplot(dd, aes(x = day_eve_class, y = freq, fill = Target)) +
      geom_col(position = position_dodge())
    p3