library(kableExtra)#To create Codebook Table
library(generics)#to avoid conflicts among similar functions
library(tidyverse)#To facilitate work flow
library(tidyr)#To manipulate Data Files
library(cowplot)#To put graphs together
library(readxl)#To read the given excel file
library(ggplot2)#To Visualize data
library(stats)#for conducting regression analyses
library(graphics)#for pairwise correlation graphs
library(GGally)
final_data <- read_excel("Final Data.xlsx")
# Checking the data structure by summarizing the files
summary(final_data)
REGION DISTNAME CATEGORY CHARTER_STATUS
Min. : 1.00 Length:953 Length:953 Length:953
1st Qu.: 6.00 Class :character Class :character Class :character
Median :10.00 Mode :character Mode :character Mode :character
Mean :10.45
3rd Qu.:15.00
Max. :20.00
SCHL_TYP CUM_STUD ISS_STUD PCT
Min. :0.0000 Min. : 0 Min. : 0 Min. : 0.000
1st Qu.:1.0000 1st Qu.: 123 1st Qu.: 0 1st Qu.: 0.000
Median :1.0000 Median : 488 Median : 37 Median : 7.075
Mean :0.8468 Mean : 2805 Mean : 218 Mean : 7.344
3rd Qu.:1.0000 3rd Qu.: 1765 3rd Qu.: 170 3rd Qu.:12.121
Max. :1.0000 Max. :46786 Max. :4293 Max. :32.203
#checking the missing data by variables in the new dataset
summary(is.na(final_data))
REGION DISTNAME CATEGORY CHARTER_STATUS
Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:953 FALSE:953 FALSE:953 FALSE:953
SCHL_TYP CUM_STUD ISS_STUD PCT
Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:953 FALSE:953 FALSE:953 FALSE:953
#subsetting the data by removing unnecessary variables
final_data <- select(final_data, REGION, DISTNAME, CHARTER_STATUS,CUM_STUD, ISS_STUD, PCT)
str(final_data)
tibble [953 x 6] (S3: tbl_df/tbl/data.frame)
$ REGION : num [1:953] 1 1 1 1 1 1 1 1 1 1 ...
$ DISTNAME : chr [1:953] "BROWNSVILLE ISD" "HARLINGEN CISD" "LA FERIA ISD" "LOS FRESNOS CISD" ...
$ CHARTER_STATUS: chr [1:953] "TRADITIONAL ISD/CSD" "TRADITIONAL ISD/CSD" "TRADITIONAL ISD/CSD" "TRADITIONAL ISD/CSD" ...
$ CUM_STUD : num [1:953] 46786 14 98 56 45925 ...
$ ISS_STUD : num [1:953] 3123 0 0 0 3104 ...
$ PCT : num [1:953] 6.68 0 0 0 6.76 ...
# Changing the variable types to Factor
final_data$REGION <- as.factor(final_data$REGION)
final_data$CHARTER_STATUS <- as.factor(final_data$CHARTER_STATUS)
final_data$DISTNAME <- as.factor(final_data$DISTNAME)
summary(final_data)
REGION DISTNAME CHARTER_STATUS
10 : 89 BIG SANDY ISD : 2 OPEN ENROLLMENT CHARTER:146
7 : 82 DAWSON ISD : 2 TRADITIONAL ISD/CSD :807
11 : 70 EDGEWOOD ISD : 2
4 : 67 HIGHLAND PARK ISD: 2
20 : 67 HUBBARD ISD : 2
13 : 61 NORTHSIDE ISD : 2
(Other):517 (Other) :941
CUM_STUD ISS_STUD PCT
Min. : 0 Min. : 0 Min. : 0.000
1st Qu.: 123 1st Qu.: 0 1st Qu.: 0.000
Median : 488 Median : 37 Median : 7.075
Mean : 2805 Mean : 218 Mean : 7.344
3rd Qu.: 1765 3rd Qu.: 170 3rd Qu.:12.121
Max. :46786 Max. :4293 Max. :32.203
ggplot()+
geom_histogram(data = final_data, aes(final_data$CUM_STUD), fill="blue", color="darkblue")+
geom_vline(xintercept = mean(final_data$CUM_STUD), col = "brown", lwd = 2)
mean(final_data$CUM_STUD)
[1] 2804.507
Now, I want to run a quick pairwise correlation:
pairs(final_data, panel = panel.smooth,
main = "Pairwise Scatter Plot",
col = 3 + (final_data$PCT > 5))
ggpairs(final_data[,-(1:2)],lower=list(continuous=wrap("smooth", method="lm")))
g <- ggplot(final_data, aes(x = CHARTER_STATUS, y = PCT, colour = factor(CHARTER_STATUS)))+
geom_boxplot(outlier.colour="black", outlier.shape = 23, outlier.size = 4, notch = TRUE)+
xlab("Charter Status")+
ylab("Percentage of Disciplinary Action in the Year 2019")
g
fit1 <- lm(PCT ~ CHARTER_STATUS, data = final_data)
summary(fit1)
Call:
lm(formula = PCT ~ CHARTER_STATUS, data = final_data)
Residuals:
Min 1Q Median 3Q Max
-7.6719 -7.6719 -0.4094 4.7090 24.5315
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.5292 0.5834 9.478 < 2e-16 ***
CHARTER_STATUSTRADITIONAL ISD/CSD 2.1428 0.6339 3.380 0.000754 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 7.049 on 951 degrees of freedom
Multiple R-squared: 0.01187, Adjusted R-squared: 0.01083
F-statistic: 11.42 on 1 and 951 DF, p-value: 0.0007543
ggplot(data=final_data, aes(y = ISS_STUD, x = REGION, fill = REGION))+
geom_violin(colour="blue", size=0.5)+
xlab("School Region")+
ylab("Total Disciplinary Actions in 2019")
# Region vs PCT
ggplot(data=final_data, aes(y = PCT, x = REGION, fill = REGION))+
geom_violin(colour = "black", size = 0.5)+
xlab("School Regions")+
ylab("Total % of Disciplinary Actions in 2019")
fit2 <- lm(PCT ~ CHARTER_STATUS + REGION, data = final_data)
summary(fit2)
Call:
lm(formula = PCT ~ CHARTER_STATUS + REGION, data = final_data)
Residuals:
Min 1Q Median 3Q Max
-12.2413 -5.2949 -0.3413 4.4963 25.1409
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.8180 1.2290 3.107 0.001949 **
CHARTER_STATUSTRADITIONAL ISD/CSD 1.9414 0.6477 2.997 0.002796 **
REGION2 4.3224 1.5705 2.752 0.006035 **
REGION3 -1.4012 1.6365 -0.856 0.392101
REGION4 -0.5833 1.3837 -0.422 0.673475
REGION5 1.6004 1.6483 0.971 0.331846
REGION6 -0.4646 1.4550 -0.319 0.749557
REGION7 0.1189 1.3293 0.089 0.928748
REGION8 -2.3699 1.6225 -1.461 0.144457
REGION9 1.1527 1.6512 0.698 0.485269
REGION10 2.9371 1.3147 2.234 0.025716 *
REGION11 1.2976 1.3635 0.952 0.341517
REGION12 0.7215 1.4178 0.509 0.610926
REGION13 1.3030 1.3987 0.932 0.351774
REGION14 5.7109 1.5618 3.657 0.000270 ***
REGION15 6.4818 1.5618 4.150 3.63e-05 ***
REGION16 4.4007 1.4583 3.018 0.002616 **
REGION17 5.4798 1.4761 3.712 0.000218 ***
REGION18 0.8454 1.6804 0.503 0.615016
REGION19 -0.6930 2.0560 -0.337 0.736158
REGION20 4.6563 1.3788 3.377 0.000763 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6.707 on 932 degrees of freedom
Multiple R-squared: 0.1232, Adjusted R-squared: 0.1044
F-statistic: 6.55 on 20 and 932 DF, p-value: < 2.2e-16
anova(fit1,fit2)
Analysis of Variance Table
Model 1: PCT ~ CHARTER_STATUS
Model 2: PCT ~ CHARTER_STATUS + REGION
Res.Df RSS Df Sum of Sq F Pr(>F)
1 951 47252
2 932 41927 19 5325.2 6.2303 2.821e-15 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(2,2))
plot(fit2)
Thank You