R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggpubr)
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## This is mgcv 1.8-42. For overview type 'help("mgcv-package")'.
library(pwr)
# Read dataset
# Build the full path instead of calling setwd(): changing the working
# directory is a global side effect that silently affects every later
# relative path in the session.
data_dir <- "/Users/saitejaravulapalli/Documents/IUPUI_SEM 01/Intro to Statistic in R/DATA SET"

# The file is read as semicolon-separated with a header row.
student_dropout <- read.csv(
  file.path(data_dir, "student dropout.csv"),
  sep = ";",
  header = TRUE
)

#Null Hypothesis 1

The unemployment rate does not affect student graduation rates.

Alpha level: 0.05. Power: 0.8. Minimum effect size: a 5-percentage-point difference in graduation rates between the high- and low-unemployment groups.

#Hypothesis 1

# Dichotomise the unemployment rate at its median: rows at or below the
# median are labelled "Low", rows strictly above it "High".
unemp_med <- median(student_dropout[["Unemployment.rate"]])
student_dropout[["unemp_high"]] <- ifelse(
  student_dropout[["Unemployment.rate"]] <= unemp_med,
  "Low",
  "High"
)

# Fisher's exact test on the Target x unemployment-group contingency table.
fisher.test(
  table(student_dropout$Target, student_dropout$unemp_high)
)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(student_dropout$Target, student_dropout$unemp_high)
## p-value = 0.04427
## alternative hypothesis: two.sided
# Jittered scatter of each student's Target category against the
# unemployment rate, coloured by Target.
# The former geom_smooth(method = "gam", formula = y ~ s(x)) layer is dropped:
# y is a factor here, the smoother could not be fitted to it ("Computation
# failed in `stat_smooth()`"), so that layer was never drawn.
ggplot(
  student_dropout,
  aes(
    x = Unemployment.rate,
    y = as.factor(Target),
    color = as.factor(Target)
  )
) +
  geom_jitter(alpha = 0.5) +
  labs(x = "Unemployment rate", y = "Target", color = "Target")

#Null Hypothesis 2

There is no difference in grades based on student gender.

Alpha: 0.01. Power: 0.9. Minimum effect size: 0.2 standard deviations between groups.

#Hypothesis 2
# Cross-tabulate Gender against Target; this table is reused for the mosaic
# plot at the end of the section.
tab <- table(student_dropout$Gender,student_dropout$Target)

# Pearson chi-squared test between Gender and Target.
# NOTE(review): the null hypothesis stated for this section is about grades,
# with an effect size in standard deviations, but the tests below compare
# Gender against Target -- confirm which outcome variable is intended.
result1 <- chisq.test(student_dropout$Gender, student_dropout$Target)
print(result1)
## 
##  Pearson's Chi-squared test
## 
## data:  student_dropout$Gender and student_dropout$Target
## X-squared = 233.27, df = 2, p-value < 2.2e-16
# Fisher's exact test on the same pair of variables.
result2 <- fisher.test(student_dropout$Gender,student_dropout$Target)
print(result2)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  student_dropout$Gender and student_dropout$Target
## p-value < 2.2e-16
## alternative hypothesis: two.sided
# Mosaic plot of the Gender x Target contingency table built above.
mosaicplot(tab, main="Mosaic Plot of Gender vs. Target")