library(readxl)
library(ggplot2) 
library(pastecs)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks pastecs::extract()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::first()   masks pastecs::first()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ dplyr::last()    masks pastecs::last()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Import the district-level spreadsheet as a tibble
district <- read_excel(path = "~/Desktop/data selection/district.xls")
# Preview the first six rows to confirm the import
head(district, n = 6L)
## # A tibble: 6 × 137
##   DISTNAME DISTRICT DZCNTYNM REGION DZRATING DZCAMPUS DPETALLC DPETBLAP DPETHISP
##   <chr>    <chr>    <chr>    <chr>  <chr>       <dbl>    <dbl>    <dbl>    <dbl>
## 1 CAYUGA … 001902   001 AND… 07     A               3      574      4.4     11.5
## 2 ELKHART… 001903   001 AND… 07     A               4     1150      4       11.8
## 3 FRANKST… 001904   001 AND… 07     A               3      808      8.5     11.3
## 4 NECHES … 001906   001 AND… 07     A               2      342      8.2     13.5
## 5 PALESTI… 001907   001 AND… 07     B               6     3360     25.1     42.9
## 6 WESTWOO… 001908   001 AND… 07     B               4     1332     19.7     26.2
## # ℹ 128 more variables: DPETWHIP <dbl>, DPETINDP <dbl>, DPETASIP <dbl>,
## #   DPETPCIP <dbl>, DPETTWOP <dbl>, DPETECOP <dbl>, DPETLEPP <dbl>,
## #   DPETSPEP <dbl>, DPETBILP <dbl>, DPETVOCP <dbl>, DPETGIFP <dbl>,
## #   DA0AT21R <dbl>, DA0912DR21R <dbl>, DAGC4X21R <dbl>, DAGC5X20R <dbl>,
## #   DAGC6X19R <dbl>, DA0GR21N <dbl>, DA0GS21N <dbl>, DDA00A001S22R <dbl>,
## #   DDA00A001222R <dbl>, DDA00A001322R <dbl>, DDA00AR01S22R <dbl>,
## #   DDA00AR01222R <dbl>, DDA00AR01322R <dbl>, DDA00AM01S22R <dbl>, …

Testing Correlations with cor()

# I am going to start early and try easy variables like DPETALLC (Total Students) and DZCAMPUS (Number of Schools)!

# Keep only the four numeric columns whose correlations we want to compare
cor_data <- dplyr::select(district, DPETALLC, DZCAMPUS, DPETBLAP, DPETHISP)

# Pairwise Pearson correlations, computed only on rows with no missing values
correlation_matrix <- cor(cor_data, use = "complete.obs", method = "pearson")

print(correlation_matrix)
##            DPETALLC   DZCAMPUS    DPETBLAP   DPETHISP
## DPETALLC 1.00000000 0.96553240  0.09809322  0.1253979
## DZCAMPUS 0.96553240 1.00000000  0.09098361  0.1507146
## DPETBLAP 0.09809322 0.09098361  1.00000000 -0.1847777
## DPETHISP 0.12539793 0.15071455 -0.18477766  1.0000000

Using the same variables, I am going to try the pairs() command.

# Scatterplot matrix: one panel for every pair of the selected columns
pairs(
  x = cor_data,
  main = "Pairwise Scatterplots of Selected Variables"
)

Looking at Pearson, Spearman, and Kendall correlations

# Pearson product-moment correlation between total students and campus count
pearson_result <- cor.test(
  x = district$DPETALLC,
  y = district$DZCAMPUS,
  method = "pearson"
)
print(pearson_result)
## 
##  Pearson's product-moment correlation
## 
## data:  district$DPETALLC and district$DZCAMPUS
## t = 128.77, df = 1205, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9614895 0.9691576
## sample estimates:
##       cor 
## 0.9655324

Normality?

# Normality (visual): density-scaled histogram with a kernel density overlay.
# `probability = TRUE` puts the histogram on the density scale so the red
# curve is comparable; spell out TRUE because `T` is an ordinary variable
# that can be reassigned. `na.rm = TRUE` keeps density() from erroring if a
# column contains missing values.
hist(district$DPETALLC, probability = TRUE)
lines(density(district$DPETALLC, na.rm = TRUE), col = "red", lwd = 3)

hist(district$DZCAMPUS, probability = TRUE)
lines(density(district$DZCAMPUS, na.rm = TRUE), col = "red", lwd = 3)

Does this look normal? There are so many entries that it's hard to judge normality visually.

# Shapiro-Wilk test of normality for total student enrollment
shapiro.test(x = district$DPETALLC)
## 
##  Shapiro-Wilk normality test
## 
## data:  district$DPETALLC
## W = 0.35045, p-value < 2.2e-16
# Shapiro-Wilk test of normality for the number of campuses
shapiro.test(x = district$DZCAMPUS)
## 
##  Shapiro-Wilk normality test
## 
## data:  district$DZCAMPUS
## W = 0.35504, p-value < 2.2e-16

Both variables (DPETALLC and DZCAMPUS) do not follow a normal distribution based on the Shapiro-Wilk test, so try the rank-based correlation tests (Spearman and Kendall) instead.

Spearman and Kendall tests

# Spearman
# The data contain ties, so an exact p-value cannot be computed. Request the
# asymptotic approximation explicitly instead of relying on the fallback that
# emits "Cannot compute exact p-value with ties".
spearman_result <- cor.test(
  district$DPETALLC, district$DZCAMPUS,
  method = "spearman", exact = FALSE
)
spearman_result
## 
##  Spearman's rank correlation rho
## 
## data:  district$DPETALLC and district$DZCAMPUS
## S = 27993507, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.9044816
# Kendall rank correlation (tau) for the same pair of variables
kendall_result <- cor.test(
  x = district$DPETALLC,
  y = district$DZCAMPUS,
  method = "kendall"
)
print(kendall_result)
## 
##  Kendall's rank correlation tau
## 
## data:  district$DPETALLC and district$DZCAMPUS
## z = 38.628, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##       tau 
## 0.7879205

In this case, there is a strong positive correlation between the total number of students and the number of schools in a district (Spearman's rho ≈ 0.90, Kendall's tau ≈ 0.79), meaning that districts with more students tend to have more schools. This relationship is statistically significant (p-value < 2.2e-16). The rank-based Kendall method was appropriate for measuring the correlation, since it does not assume a linear relationship or a normal distribution of the data.

This result reinforces that the more students a district has, the higher the number of schools, and the correlation is highly significant.