library(readxl)
library(ggplot2)
library(pastecs)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks pastecs::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks pastecs::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks pastecs::last()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Import the district-level spreadsheet into a tibble.
district <- read_excel(path = "~/Desktop/data selection/district.xls")
# Preview the first six rows to confirm the import looks right.
head(district, n = 6L)
## # A tibble: 6 × 137
## DISTNAME DISTRICT DZCNTYNM REGION DZRATING DZCAMPUS DPETALLC DPETBLAP DPETHISP
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 CAYUGA … 001902 001 AND… 07 A 3 574 4.4 11.5
## 2 ELKHART… 001903 001 AND… 07 A 4 1150 4 11.8
## 3 FRANKST… 001904 001 AND… 07 A 3 808 8.5 11.3
## 4 NECHES … 001906 001 AND… 07 A 2 342 8.2 13.5
## 5 PALESTI… 001907 001 AND… 07 B 6 3360 25.1 42.9
## 6 WESTWOO… 001908 001 AND… 07 B 4 1332 19.7 26.2
## # ℹ 128 more variables: DPETWHIP <dbl>, DPETINDP <dbl>, DPETASIP <dbl>,
## # DPETPCIP <dbl>, DPETTWOP <dbl>, DPETECOP <dbl>, DPETLEPP <dbl>,
## # DPETSPEP <dbl>, DPETBILP <dbl>, DPETVOCP <dbl>, DPETGIFP <dbl>,
## # DA0AT21R <dbl>, DA0912DR21R <dbl>, DAGC4X21R <dbl>, DAGC5X20R <dbl>,
## # DAGC6X19R <dbl>, DA0GR21N <dbl>, DA0GS21N <dbl>, DDA00A001S22R <dbl>,
## # DDA00A001222R <dbl>, DDA00A001322R <dbl>, DDA00AR01S22R <dbl>,
## # DDA00AR01222R <dbl>, DDA00AR01322R <dbl>, DDA00AM01S22R <dbl>, …
# Start with easy variables: DPETALLC (total students) and DZCAMPUS (number
# of schools), together with DPETBLAP and DPETHISP.
cor_data <- district %>%
  select(DPETALLC, DZCAMPUS, DPETBLAP, DPETHISP)
# Pearson correlation matrix, using only rows with no missing values in any
# of the selected columns.
correlation_matrix <- cor(cor_data, use = "complete.obs")
print(correlation_matrix)
## DPETALLC DZCAMPUS DPETBLAP DPETHISP
## DPETALLC 1.00000000 0.96553240 0.09809322 0.1253979
## DZCAMPUS 0.96553240 1.00000000 0.09098361 0.1507146
## DPETBLAP 0.09809322 0.09098361 1.00000000 -0.1847777
## DPETHISP 0.12539793 0.15071455 -0.18477766 1.0000000
# Scatterplot matrix of every pair of the selected variables.
pairs(
  x = cor_data,
  main = "Pairwise Scatterplots of Selected Variables"
)
# Pearson product-moment correlation test: total students vs. number of schools.
pearson_result <- cor.test(
  x = district$DPETALLC,
  y = district$DZCAMPUS,
  method = "pearson"
)
print(pearson_result)
##
## Pearson's product-moment correlation
##
## data: district$DPETALLC and district$DZCAMPUS
## t = 128.77, df = 1205, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9614895 0.9691576
## sample estimates:
## cor
## 0.9655324
# Normality (visual): density-scaled histogram of each variable with a kernel
# density estimate drawn on top in red.
# `probability = TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned. `na.rm = TRUE` keeps density() from stopping on missing
# values, which hist() already drops on its own.
hist(district$DPETALLC, probability = TRUE)
lines(density(district$DPETALLC, na.rm = TRUE), col = "red", lwd = 3)
hist(district$DZCAMPUS, probability = TRUE)
lines(density(district$DZCAMPUS, na.rm = TRUE), col = "red", lwd = 3)
Does this look normal? There are so many entries that it is hard to judge
normality visually, so a formal test follows.
# Shapiro-Wilk normality test for total students.
shapiro.test(x = district$DPETALLC)
##
## Shapiro-Wilk normality test
##
## data: district$DPETALLC
## W = 0.35045, p-value < 2.2e-16
# Shapiro-Wilk normality test for number of schools.
shapiro.test(x = district$DZCAMPUS)
##
## Shapiro-Wilk normality test
##
## data: district$DZCAMPUS
## W = 0.35504, p-value < 2.2e-16
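Both variables (DPETALLC and DZCAMPUS) depart from a normal distribution according to the Shapiro-Wilk test (p < 2.2e-16 for each), so the Pearson test's normality assumption is not met; try the rank-based correlation tests (Spearman and Kendall) instead.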
# Spearman rank correlation: total students vs. number of schools.
# The data contain tied values, so an exact p-value cannot be computed.
# Asking for the asymptotic p-value with `exact = FALSE` yields the same
# result R falls back to on its own, without the "Cannot compute exact
# p-value with ties" warning.
spearman_result <- cor.test(
  district$DPETALLC,
  district$DZCAMPUS,
  method = "spearman",
  exact = FALSE
)
spearman_result
##
## Spearman's rank correlation rho
##
## data: district$DPETALLC and district$DZCAMPUS
## S = 27993507, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.9044816
# Kendall rank correlation (tau): total students vs. number of schools.
kendall_result <- cor.test(
  x = district$DPETALLC,
  y = district$DZCAMPUS,
  method = "kendall"
)
print(kendall_result)
##
## Kendall's rank correlation tau
##
## data: district$DPETALLC and district$DZCAMPUS
## z = 38.628, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.7879205
In this case, there is a strong positive correlation between the total number of students and the number of schools in a district (Kendall's tau = 0.79, Spearman's rho = 0.90), meaning that districts with more students tend to have more schools. This relationship is statistically significant, with a p-value < 2.2e-16. The Kendall method is appropriate for measuring the correlation here because it is rank-based: it does not assume a linear relationship or normally distributed data.
All three tests (Pearson, Spearman, and Kendall) point the same way: the more students a district has, the more schools it has, and the correlation is highly significant in each case.