This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown, see <http://rmarkdown.rstudio.com>.
When you click the **Knit** button, a document is generated that includes both the content and the output of any embedded R code chunks. You can embed an R code chunk like this:
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(ggplot2)
library(ggpubr)
library(modelr)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(broom)
##
## Attaching package: 'broom'
##
## The following object is masked from 'package:modelr':
##
## bootstrap
# Fix the RNG seed so that any randomised step run after this point is
# reproducible.
set.seed(123)

# Resolve the data file without calling setwd(): changing the working
# directory is a global side effect that leaks into every later relative path.
# Set the STUDENT_DROPOUT_DATA_DIR environment variable to read the file from
# another folder; when it is unset or empty, the original location is used.
data_dir <- Sys.getenv("STUDENT_DROPOUT_DATA_DIR")
if (!nzchar(data_dir)) {
  data_dir <- "/Users/saitejaravulapalli/Documents/IUPUI_SEM 01/Intro to Statistic in R/DATA SET"
}
data_path <- file.path(data_dir, "student dropout.csv")

# Fail early with the full path instead of read.csv()'s generic
# "cannot open file" error.
if (!file.exists(data_path)) {
  stop("Data file not found: ", data_path, call. = FALSE)
}

# The file is semicolon-delimited and its first row holds the column names.
data <- read.csv(data_path, sep = ";", header = TRUE)

# Print each column's name, type and first few values.
str(data)
## 'data.frame': 4424 obs. of 37 variables:
## $ Marital.status : int 1 1 1 1 2 2 1 1 1 1 ...
## $ Application.mode : int 17 15 1 17 39 39 1 18 1 1 ...
## $ Application.order : int 5 1 5 2 1 1 1 4 3 1 ...
## $ Course : int 171 9254 9070 9773 8014 9991 9500 9254 9238 9238 ...
## $ Daytime.evening.attendance. : int 1 1 1 1 0 0 1 1 1 1 ...
## $ Previous.qualification : int 1 1 1 1 1 19 1 1 1 1 ...
## $ Previous.qualification..grade. : num 122 160 122 122 100 ...
## $ Nacionality : int 1 1 1 1 1 1 1 1 62 1 ...
## $ Mother.s.qualification : int 19 1 37 38 37 37 19 37 1 1 ...
## $ Father.s.qualification : int 12 3 37 37 38 37 38 37 1 19 ...
## $ Mother.s.occupation : int 5 3 9 5 9 9 7 9 9 4 ...
## $ Father.s.occupation : int 9 3 9 3 9 7 10 9 9 7 ...
## $ Admission.grade : num 127 142 125 120 142 ...
## $ Displaced : int 1 1 1 1 0 0 1 1 0 1 ...
## $ Educational.special.needs : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Debtor : int 0 0 0 0 0 1 0 0 0 1 ...
## $ Tuition.fees.up.to.date : int 1 0 0 1 1 1 1 0 1 0 ...
## $ Gender : int 1 1 1 0 0 1 0 1 0 0 ...
## $ Scholarship.holder : int 0 0 0 0 0 0 1 0 1 0 ...
## $ Age.at.enrollment : int 20 19 19 20 45 50 18 22 21 18 ...
## $ International : int 0 0 0 0 0 0 0 0 1 0 ...
## $ Curricular.units.1st.sem..credited. : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Curricular.units.1st.sem..enrolled. : int 0 6 6 6 6 5 7 5 6 6 ...
## $ Curricular.units.1st.sem..evaluations. : int 0 6 0 8 9 10 9 5 8 9 ...
## $ Curricular.units.1st.sem..approved. : int 0 6 0 6 5 5 7 0 6 5 ...
## $ Curricular.units.1st.sem..grade. : num 0 14 0 13.4 12.3 ...
## $ Curricular.units.1st.sem..without.evaluations.: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Curricular.units.2nd.sem..credited. : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Curricular.units.2nd.sem..enrolled. : int 0 6 6 6 6 5 8 5 6 6 ...
## $ Curricular.units.2nd.sem..evaluations. : int 0 6 0 10 6 17 8 5 7 14 ...
## $ Curricular.units.2nd.sem..approved. : int 0 6 0 5 6 5 8 0 6 2 ...
## $ Curricular.units.2nd.sem..grade. : num 0 13.7 0 12.4 13 ...
## $ Curricular.units.2nd.sem..without.evaluations.: int 0 0 0 0 0 5 0 0 0 0 ...
## $ Unemployment.rate : num 10.8 13.9 10.8 9.4 13.9 16.2 15.5 15.5 16.2 8.9 ...
## $ Inflation.rate : num 1.4 -0.3 1.4 -0.8 -0.3 0.3 2.8 2.8 0.3 1.4 ...
## $ GDP : num 1.74 0.79 1.74 -3.12 0.79 -0.92 -4.06 -4.06 -0.92 3.51 ...
## $ Target : chr "Dropout" "Graduate" "Dropout" "Graduate" ...
# Encode the response as numeric 0/1: 1 for "Graduate", 0 for every other
# label. The recode only runs while Target is still non-numeric, so re-running
# this step does not compare the already-recoded 0/1 values against
# "Graduate" (which would silently turn the whole column into 0).
if (!is.numeric(data$Target)) {
  # Any label that is neither "Dropout" nor "Graduate" also ends up as 0.
  # Warn so that lumping such rows in with the dropouts is a visible decision
  # rather than a silent one.
  # NOTE(review): confirm whether rows with other labels should be excluded
  # from the model instead of being treated as dropouts.
  target_labels <- unique(as.character(data$Target))
  unexpected_labels <- setdiff(target_labels, c("Dropout", "Graduate", NA))
  if (length(unexpected_labels) > 0) {
    warning(
      "Target labels coded as 0 together with \"Dropout\": ",
      paste(unexpected_labels, collapse = ", "),
      call. = FALSE
    )
  }
  data$Target <- as.numeric(data$Target == "Graduate")
}
# Fit a binomial GLM of the 0/1 Target on age at enrollment and admission
# grade.
model <- glm(
  formula = Target ~ Age.at.enrollment + Admission.grade,
  family = binomial,
  data = data
)
# Print the coefficient table, deviances and AIC of the fitted model.
summary(model)
##
## Call:
## glm(formula = Target ~ Age.at.enrollment + Admission.grade, family = binomial,
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.949996 0.295953 -3.210 0.00133 **
## Age.at.enrollment -0.056417 0.004567 -12.353 < 2e-16 ***
## Admission.grade 0.017701 0.002193 8.071 6.99e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6133.0 on 4423 degrees of freedom
## Residual deviance: 5889.8 on 4421 degrees of freedom
## AIC: 5895.8
##
## Number of Fisher Scoring iterations: 4
# 95% confidence intervals for the fitted model's coefficients
# (0.95 is confint()'s default level, written out here for clarity).
confint(object = model, level = 0.95)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -1.53074793 -0.37033358
## Age.at.enrollment -0.06547837 -0.04756894
## Admission.grade 0.01341922 0.02201884
# Scatter plot of the 0/1 Target against age at enrollment.
ggplot(data = data, mapping = aes(Age.at.enrollment, Target)) +
  geom_point() +
  labs(title = "Scatter Plot: Age at Enrollment vs. Target")
# Scatter plot of the 0/1 Target against admission grade.
ggplot(data = data, mapping = aes(Admission.grade, Target)) +
  geom_point() +
  labs(title = "Scatter Plot: Admission Grade vs. Target")