library(readr)
library(readxl)
# Read the admissions CSV from its local path into `Admission_Predict`.
Admission_Predict <- read_csv(
  file = "C:/Users/USER/Desktop/data_science_portfolio/Admission_Predict.csv"
)
## Rows: 400 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): Serial No., GRE Score, TOEFL Score, University Rating, SOP, LOR, CG...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(stringr)
# Tidy the column names in two passes:
#   1. replace every whitespace character with an underscore;
#   2. lower-case the result, then title-case it.
names(Admission_Predict) <- str_replace_all(
  names(Admission_Predict),
  pattern = "\\s",
  replacement = "_"
)
names(Admission_Predict) <- str_to_title(str_to_lower(names(Admission_Predict)))

# Preview the first rows with the cleaned names.
head(Admission_Predict)
## # A tibble: 6 × 9
## Serial_no. Gre_score Toefl_score University_rating Sop Lor Cgpa Research
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 337 118 4 4.5 4.5 9.65 1
## 2 2 324 107 4 4 4.5 8.87 1
## 3 3 316 104 3 3 3.5 8 1
## 4 4 322 110 3 3.5 2.5 8.67 1
## 5 5 314 103 2 2 3 8.21 0
## 6 6 330 115 5 4.5 3 9.34 1
## # ℹ 1 more variable: Chance_of_admit <dbl>
# List every column name after the clean-up.
colnames(Admission_Predict)
## [1] "Serial_no." "Gre_score" "Toefl_score"
## [4] "University_rating" "Sop" "Lor"
## [7] "Cgpa" "Research" "Chance_of_admit"
# Logistic regression: model Chance_of_admit from the SOP rating, GRE score and
# LOR rating using a binomial GLM, then print the fitted model summary.
g <- glm(
  Chance_of_admit ~ Sop + Gre_score + Lor,
  family = "binomial",
  data = Admission_Predict
)
summary(g)
##
## Call:
## glm(formula = Chance_of_admit ~ Sop + Gre_score + Lor, family = "binomial",
## data = Admission_Predict)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -65.30172 7.65939 -8.526 < 2e-16 ***
## Sop 0.99611 0.26188 3.804 0.000143 ***
## Gre_score 0.18664 0.02382 7.836 4.64e-15 ***
## Lor 0.65551 0.26487 2.475 0.013329 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 550.51 on 399 degrees of freedom
## Residual deviance: 251.74 on 396 degrees of freedom
## AIC: 259.74
##
## Number of Fisher Scoring iterations: 6
library(ggplot2)
# Jittered scatter of Chance_of_admit against Cgpa with a binomial-GLM curve
# (no confidence band) drawn on top.
# NOTE(review): `height = 0.5` jitters each point by up to 0.5 in both
# directions; if Chance_of_admit is coded 0/1 the two bands meet at 0.5.
# A smaller value may separate the classes better -- confirm intent.
ggplot(data = Admission_Predict, mapping = aes(x = Cgpa, y = Chance_of_admit)) +
  geom_jitter(alpha = 0.1, height = 0.5) +
  geom_smooth(
    method = "glm",
    se = FALSE,
    method.args = list(family = "binomial")
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Collapse the data to one row per distinct Cgpa value:
#   prop  - mean of Chance_of_admit within that Cgpa value
#   count - number of rows sharing that Cgpa value
# NOTE(review): the object is named `sum`, the same name as base::sum(); the
# name is kept because later statements refer to it.
sum <- Admission_Predict %>%
  group_by(Cgpa) %>%
  summarise(
    prop = mean(Chance_of_admit),
    count = n()
  )
sum
## # A tibble: 168 × 3
## Cgpa prop count
## <dbl> <dbl> <int>
## 1 6.8 0 1
## 2 7.2 0 1
## 3 7.25 0 1
## 4 7.28 0 1
## 5 7.3 0 1
## 6 7.34 0 2
## 7 7.36 0 1
## 8 7.4 0 1
## 9 7.43 0 2
## 10 7.46 0 3
## # ℹ 158 more rows
# Scatter of the per-Cgpa proportion against Cgpa.
ggplot(data = sum, mapping = aes(x = Cgpa, y = prop)) +
  geom_point()
# Binomial GLM on the aggregated data: the response is the per-Cgpa proportion
# and each row is weighted by the number of observations behind it (`count`).
model2 <- glm(
  prop ~ Cgpa,
  family = "binomial",
  data = sum,
  weights = count
)
summary(model2)
##
## Call:
## glm(formula = prop ~ Cgpa, family = "binomial", data = sum, weights = count)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -52.4698 5.3325 -9.840 <2e-16 ***
## Cgpa 6.0403 0.6149 9.824 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 397.334 on 167 degrees of freedom
## Residual deviance: 84.681 on 166 degrees of freedom
## AIC: 143.84
##
## Number of Fisher Scoring iterations: 6
# Per-Cgpa proportions with a binomial-GLM curve overlaid.
# Each row of `sum` is a proportion computed from `count` observations, so the
# smoother must receive `count` as its weight. Without it glm() is handed
# fractional "successes" and emits
# "non-integer #successes in a binomial glm!", and the drawn curve is an
# unweighted fit rather than the weighted fit used for `model2`.
ggplot(sum, aes(x = Cgpa, y = prop)) +
  geom_point() +
  geom_smooth(
    aes(weight = count),
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE
  )
## `geom_smooth()` using formula = 'y ~ x'