Sample Project 2

Invoking Required Libraries

library(kableExtra)#To create Codebook Table
library(generics)#to avoid conflicts among similar functions
library(tidyverse)#To facilitate work flow
library(tidyr)#To manipulate Data Files
library(cowplot)#To put graphs together  
library(readxl)#To read the given excel file
library(ggplot2)#To Visualize data
library(stats)#for conducting regression analyses
library(graphics)#for pairwise correlation graphs
library(GGally)

Reading the Data Files in R

final_data <- read_excel("Final Data.xlsx")
# Checking the data structure by summarizing the files
summary(final_data)

     REGION        DISTNAME           CATEGORY         CHARTER_STATUS    
 Min.   : 1.00   Length:953         Length:953         Length:953        
 1st Qu.: 6.00   Class :character   Class :character   Class :character  
 Median :10.00   Mode  :character   Mode  :character   Mode  :character  
 Mean   :10.45                                                           
 3rd Qu.:15.00                                                           
 Max.   :20.00                                                           
    SCHL_TYP         CUM_STUD        ISS_STUD         PCT        
 Min.   :0.0000   Min.   :    0   Min.   :   0   Min.   : 0.000  
 1st Qu.:1.0000   1st Qu.:  123   1st Qu.:   0   1st Qu.: 0.000  
 Median :1.0000   Median :  488   Median :  37   Median : 7.075  
 Mean   :0.8468   Mean   : 2805   Mean   : 218   Mean   : 7.344  
 3rd Qu.:1.0000   3rd Qu.: 1765   3rd Qu.: 170   3rd Qu.:12.121  
 Max.   :1.0000   Max.   :46786   Max.   :4293   Max.   :32.203

Checking for the Missing Data

#checking the missing data by variables in the new dataset
summary(is.na(final_data))

   REGION         DISTNAME        CATEGORY       CHARTER_STATUS 
 Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:953       FALSE:953       FALSE:953       FALSE:953      
  SCHL_TYP        CUM_STUD        ISS_STUD          PCT         
 Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:953       FALSE:953       FALSE:953       FALSE:953

#subsetting the data by removing unnecessary variables
final_data <- select(final_data, REGION, DISTNAME, CHARTER_STATUS,CUM_STUD, ISS_STUD, PCT)
str(final_data)

tibble [953 x 6] (S3: tbl_df/tbl/data.frame)
 $ REGION        : num [1:953] 1 1 1 1 1 1 1 1 1 1 ...
 $ DISTNAME      : chr [1:953] "BROWNSVILLE ISD" "HARLINGEN CISD" "LA FERIA ISD" "LOS FRESNOS CISD" ...
 $ CHARTER_STATUS: chr [1:953] "TRADITIONAL ISD/CSD" "TRADITIONAL ISD/CSD" "TRADITIONAL ISD/CSD" "TRADITIONAL ISD/CSD" ...
 $ CUM_STUD      : num [1:953] 46786 14 98 56 45925 ...
 $ ISS_STUD      : num [1:953] 3123 0 0 0 3104 ...
 $ PCT           : num [1:953] 6.68 0 0 0 6.76 ...

# Changing the variable types to Factor
final_data$REGION <- as.factor(final_data$REGION)
final_data$CHARTER_STATUS <- as.factor(final_data$CHARTER_STATUS)
final_data$DISTNAME <- as.factor(final_data$DISTNAME)
summary(final_data)

     REGION                 DISTNAME                   CHARTER_STATUS
 10     : 89   BIG SANDY ISD    :  2   OPEN ENROLLMENT CHARTER:146   
 7      : 82   DAWSON ISD       :  2   TRADITIONAL ISD/CSD    :807   
 11     : 70   EDGEWOOD ISD     :  2                                 
 4      : 67   HIGHLAND PARK ISD:  2                                 
 20     : 67   HUBBARD ISD      :  2                                 
 13     : 61   NORTHSIDE ISD    :  2                                 
 (Other):517   (Other)          :941                                 
    CUM_STUD        ISS_STUD         PCT        
 Min.   :    0   Min.   :   0   Min.   : 0.000  
 1st Qu.:  123   1st Qu.:   0   1st Qu.: 0.000  
 Median :  488   Median :  37   Median : 7.075  
 Mean   : 2805   Mean   : 218   Mean   : 7.344  
 3rd Qu.: 1765   3rd Qu.: 170   3rd Qu.:12.121  
 Max.   :46786   Max.   :4293   Max.   :32.203

First Impression

ggplot()+
  geom_histogram(data = final_data, aes(final_data$CUM_STUD), fill="blue", color="darkblue")+ 
  geom_vline(xintercept = mean(final_data$CUM_STUD), col = "brown", lwd = 2)

mean(final_data$CUM_STUD)

[1] 2804.507

Pairwise Correlation

Now, I want to run a quick pairwise correlation:

pairs(final_data, panel = panel.smooth,
      main = "Pairwise Scatter Plot", 
      col = 3 + (final_data$PCT > 5))

Pairwise Correlation2

ggpairs(final_data[,-(1:2)],lower=list(continuous=wrap("smooth", method="lm")))

Pairwise CHARTER STATUS vs. PCT

g <- ggplot(final_data, aes(x = CHARTER_STATUS, y = PCT, colour = factor(CHARTER_STATUS)))+
  geom_boxplot(outlier.colour="black", outlier.shape = 23, outlier.size = 4, notch = TRUE)+
    xlab("Charter Status")+
  ylab("Percentage of Disciplinary Action in the Year 2019")
g

Linear Regression

fit1 <- lm(PCT ~ CHARTER_STATUS, data = final_data)
summary(fit1)


Call:
lm(formula = PCT ~ CHARTER_STATUS, data = final_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.6719 -7.6719 -0.4094  4.7090 24.5315 

Coefficients:
                                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)                         5.5292     0.5834   9.478  < 2e-16 ***
CHARTER_STATUSTRADITIONAL ISD/CSD   2.1428     0.6339   3.380 0.000754 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 7.049 on 951 degrees of freedom
Multiple R-squared:  0.01187,   Adjusted R-squared:  0.01083 
F-statistic: 11.42 on 1 and 951 DF,  p-value: 0.0007543

Pairwise REGION vs. ISS_STUD/PCT

ggplot(data=final_data, aes(y = ISS_STUD, x = REGION, fill = REGION))+
  geom_violin(colour="blue", size=0.5)+
  xlab("School Region")+
  ylab("Total Disciplinary Actions in 2019")

# Region vs PCT
ggplot(data=final_data, aes(y = PCT, x = REGION, fill = REGION))+
  geom_violin(colour = "black", size = 0.5)+
  xlab("School Regions")+
  ylab("Total % of Disciplinary Actions in 2019")

Adding Region in the Game

fit2 <- lm(PCT ~ CHARTER_STATUS + REGION, data = final_data)
summary(fit2)


Call:
lm(formula = PCT ~ CHARTER_STATUS + REGION, data = final_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-12.2413  -5.2949  -0.3413   4.4963  25.1409 

Coefficients:
                                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)                         3.8180     1.2290   3.107 0.001949 ** 
CHARTER_STATUSTRADITIONAL ISD/CSD   1.9414     0.6477   2.997 0.002796 ** 
REGION2                             4.3224     1.5705   2.752 0.006035 ** 
REGION3                            -1.4012     1.6365  -0.856 0.392101    
REGION4                            -0.5833     1.3837  -0.422 0.673475    
REGION5                             1.6004     1.6483   0.971 0.331846    
REGION6                            -0.4646     1.4550  -0.319 0.749557    
REGION7                             0.1189     1.3293   0.089 0.928748    
REGION8                            -2.3699     1.6225  -1.461 0.144457    
REGION9                             1.1527     1.6512   0.698 0.485269    
REGION10                            2.9371     1.3147   2.234 0.025716 *  
REGION11                            1.2976     1.3635   0.952 0.341517    
REGION12                            0.7215     1.4178   0.509 0.610926    
REGION13                            1.3030     1.3987   0.932 0.351774    
REGION14                            5.7109     1.5618   3.657 0.000270 ***
REGION15                            6.4818     1.5618   4.150 3.63e-05 ***
REGION16                            4.4007     1.4583   3.018 0.002616 ** 
REGION17                            5.4798     1.4761   3.712 0.000218 ***
REGION18                            0.8454     1.6804   0.503 0.615016    
REGION19                           -0.6930     2.0560  -0.337 0.736158    
REGION20                            4.6563     1.3788   3.377 0.000763 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.707 on 932 degrees of freedom
Multiple R-squared:  0.1232,    Adjusted R-squared:  0.1044 
F-statistic:  6.55 on 20 and 932 DF,  p-value: < 2.2e-16

Model Fit

anova(fit1,fit2)

Analysis of Variance Table

Model 1: PCT ~ CHARTER_STATUS
Model 2: PCT ~ CHARTER_STATUS + REGION
  Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
1    951 47252                                  
2    932 41927 19    5325.2 6.2303 2.821e-15 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual Analysis

par(mfrow = c(2,2))
plot(fit2)

The first plot shows that the red line is fairly stable and passes through the origin. I don’t see any absence of model departure.
The Q-Q plot tests the normality of the data. As can be seen, it’s somewhere close to 45 degree suggesting normality.
The scale-location plot the standardized ordinary residuals. The scales seem to be fairly comparable across the subgroups.
The final plot shows the relationship between residuals and leverage. Having the dichotomous variable, we see a pattern but we don’t see higher departure in Cook’s distance.

Thank You

Sample Project 2

Nirmal Ghimire

7/19/2021

Invoking Required Libraries

Reading the Data Files in R

Checking for the Missing Data

First Impression

Pairwise Correlation

Pairwise Correlation2

Pairwise CHARTER STATUS vs. PCT

Linear Regression

Pairwise REGION vs. ISS_STUD/PCT

Adding Region in the Game

Model Fit

Residual Analysis