library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
# Load the district-level spreadsheet; the path is resolved relative to the
# current working directory.
district_3_ <- read_excel("district (3).xls")

# Descriptive statistics for DA0AT21R. norm = TRUE adds the skewness, kurtosis
# and Shapiro-Wilk entries to the summary. It is spelled out as TRUE because
# `T` is an ordinary variable that can be reassigned.
pastecs::stat.desc(district_3_$DA0AT21R, norm = TRUE)
##       nbr.val      nbr.null        nbr.na           min           max 
##  1.203000e+03  0.000000e+00  4.000000e+00 -1.000000e+00  1.000000e+02 
##         range           sum        median          mean       SE.mean 
##  1.010000e+02  1.139929e+05  9.540000e+01  9.475719e+01  1.402861e-01 
##  CI.mean.0.95           var       std.dev      coef.var      skewness 
##  2.752329e-01  2.367528e+01  4.865725e+00  5.134940e-02 -1.349334e+01 
##      skew.2SE      kurtosis      kurt.2SE    normtest.W    normtest.p 
## -9.565050e+01  2.503308e+02  8.879953e+02  3.600778e-01  9.455757e-54

#Mean: 9.475719e+01

#Median: 9.540000e+01

#Skewed to the left

# Normal Q-Q plot of the raw DA0AT21R values, with a red reference line.
qqnorm(district_3_$DA0AT21R)
qqline(district_3_$DA0AT21R, col = "red")

# Shapiro-Wilk normality test on the same column.
shapiro.test(district_3_$DA0AT21R)
## 
##  Shapiro-Wilk normality test
## 
## data:  district_3_$DA0AT21R
## W = 0.36008, p-value < 2.2e-16

#P-value is far below .05 (p < 2.2e-16), so we reject the null hypothesis of normality: the data are not plausibly normal.

# Normal Q-Q plot of the raw DA0AT21R values, with a red reference line.
qqnorm(district_3_$DA0AT21R)
qqline(district_3_$DA0AT21R, col = "red")

# One-sample Kolmogorov-Smirnov test against a normal distribution whose mean
# and standard deviation are estimated from the sample. The column contains
# missing values, so mean() and sd() need na.rm = TRUE; without it both return
# NA and the test reports D = NA, p-value = NA.
# NOTE: the recorded output that follows was produced before this fix; re-run
# to refresh it.
ks.test(
  district_3_$DA0AT21R,
  "pnorm",
  mean = mean(district_3_$DA0AT21R, na.rm = TRUE),
  sd = sd(district_3_$DA0AT21R, na.rm = TRUE)
)
## Warning in ks.test.default(district_3_$DA0AT21R, "pnorm", mean =
## mean(district_3_$DA0AT21R), : ties should not be present for the one-sample
## Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  district_3_$DA0AT21R
## D = NA, p-value = NA
## alternative hypothesis: two-sided
# Keep only rows with a strictly positive DA0AT21R. This also drops rows where
# the value is NA, because filter() discards rows whose condition is not TRUE.
district_3_cleaned <- district_3_ %>%
  filter(DA0AT21R > 0)

# Re-run the Kolmogorov-Smirnov test on the cleaned column. The call previously
# still pointed at district_3_, so it repeated the D = NA result instead of
# testing the cleaned data.
# NOTE: the recorded output that follows was produced before this fix; re-run
# to refresh it.
ks.test(
  district_3_cleaned$DA0AT21R,
  "pnorm",
  mean = mean(district_3_cleaned$DA0AT21R),
  sd = sd(district_3_cleaned$DA0AT21R)
)
## Warning in ks.test.default(district_3_$DA0AT21R, "pnorm", mean =
## mean(district_3_$DA0AT21R), : ties should not be present for the one-sample
## Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  district_3_$DA0AT21R
## D = NA, p-value = NA
## alternative hypothesis: two-sided
# district_3_cleaned already holds only rows with DA0AT21R > 0 (it is built
# with that filter), so the repeated filter that stood here was a no-op and
# has been dropped.
# Mean of the cleaned column.
mean(district_3_cleaned$DA0AT21R)
## [1] 94.91665
# Density-scaled histogram of the cleaned column with a kernel density
# estimate overlaid in red. TRUE is spelled out because `T` can be reassigned.
hist(district_3_cleaned$DA0AT21R, breaks = 10, probability = TRUE)
lines(density(district_3_cleaned$DA0AT21R), col = "red", lwd = 2)

# Normal Q-Q plot of the cleaned column, with a red reference line.
qqnorm(district_3_cleaned$DA0AT21R)
qqline(district_3_cleaned$DA0AT21R, col = "red")

# Two-column table: the cleaned DA0AT21R values alongside their square roots.
district_3_cleanedsqrt <- district_3_cleaned %>%
  mutate(DA0AT21R_SQRT = sqrt(DA0AT21R)) %>%
  select(DA0AT21R, DA0AT21R_SQRT)

# Preview the first rows of the cleaned table.
# NOTE(review): this previews district_3_cleaned, not the
# district_3_cleanedsqrt table built just above; confirm which was intended.
head(district_3_cleaned)
## # A tibble: 6 × 137
##   DISTNAME DISTRICT DZCNTYNM REGION DZRATING DZCAMPUS DPETALLC DPETBLAP DPETHISP
##   <chr>    <chr>    <chr>    <chr>  <chr>       <dbl>    <dbl>    <dbl>    <dbl>
## 1 CAYUGA … 001902   001 AND… 07     A               3      574      4.4     11.5
## 2 ELKHART… 001903   001 AND… 07     A               4     1150      4       11.8
## 3 FRANKST… 001904   001 AND… 07     A               3      808      8.5     11.3
## 4 NECHES … 001906   001 AND… 07     A               2      342      8.2     13.5
## 5 PALESTI… 001907   001 AND… 07     B               6     3360     25.1     42.9
## 6 WESTWOO… 001908   001 AND… 07     B               4     1332     19.7     26.2
## # ℹ 128 more variables: DPETWHIP <dbl>, DPETINDP <dbl>, DPETASIP <dbl>,
## #   DPETPCIP <dbl>, DPETTWOP <dbl>, DPETECOP <dbl>, DPETLEPP <dbl>,
## #   DPETSPEP <dbl>, DPETBILP <dbl>, DPETVOCP <dbl>, DPETGIFP <dbl>,
## #   DA0AT21R <dbl>, DA0912DR21R <dbl>, DAGC4X21R <dbl>, DAGC5X20R <dbl>,
## #   DAGC6X19R <dbl>, DA0GR21N <dbl>, DA0GS21N <dbl>, DDA00A001S22R <dbl>,
## #   DDA00A001222R <dbl>, DDA00A001322R <dbl>, DDA00AR01S22R <dbl>,
## #   DDA00AR01222R <dbl>, DDA00AR01322R <dbl>, DDA00AM01S22R <dbl>, …
library(ggplot2)
# Scatter plot of DPETECOP against the natural log of DA0AT21R.
ggplot(district_3_cleaned, aes(x = log(DA0AT21R), y = DPETECOP)) +
  geom_point() +
  ggtitle("Log-Transformed Data")