library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readxl)
library(pastecs)
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
# Import the district-level spreadsheet (path is relative to the working
# directory).
district <- read_excel(path = "district.xls")
# Peek at the leading rows to confirm the import looks sensible.
head(district, n = 6L)
## # A tibble: 6 × 137
## DISTNAME DISTRICT DZCNTYNM REGION DZRATING DZCAMPUS DPETALLC DPETBLAP DPETHISP
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 CAYUGA … 001902 001 AND… 07 A 3 574 4.4 11.5
## 2 ELKHART… 001903 001 AND… 07 A 4 1150 4 11.8
## 3 FRANKST… 001904 001 AND… 07 A 3 808 8.5 11.3
## 4 NECHES … 001906 001 AND… 07 A 2 342 8.2 13.5
## 5 PALESTI… 001907 001 AND… 07 B 6 3360 25.1 42.9
## 6 WESTWOO… 001908 001 AND… 07 B 4 1332 19.7 26.2
## # ℹ 128 more variables: DPETWHIP <dbl>, DPETINDP <dbl>, DPETASIP <dbl>,
## # DPETPCIP <dbl>, DPETTWOP <dbl>, DPETECOP <dbl>, DPETLEPP <dbl>,
## # DPETSPEP <dbl>, DPETBILP <dbl>, DPETVOCP <dbl>, DPETGIFP <dbl>,
## # DA0AT21R <dbl>, DA0912DR21R <dbl>, DAGC4X21R <dbl>, DAGC5X20R <dbl>,
## # DAGC6X19R <dbl>, DA0GR21N <dbl>, DA0GS21N <dbl>, DDA00A001S22R <dbl>,
## # DDA00A001222R <dbl>, DDA00A001322R <dbl>, DDA00AR01S22R <dbl>,
## # DDA00AR01222R <dbl>, DDA00AR01322R <dbl>, DDA00AM01S22R <dbl>, …
# Within the district dataset, the variable of particular interest is
# TOTAL ACTUAL OPERATING EXPENDITURES PER PUPIL (column DPFEAOPFK).
MainVariable <- district[["DPFEAOPFK"]]
# Descriptive statistics for that column (counts, missing values, range,
# centre and spread).
stat.desc(MainVariable)
## nbr.val nbr.null nbr.na min max range
## 1.202000e+03 0.000000e+00 5.000000e+00 6.755000e+03 1.784670e+05 1.717120e+05
## sum median mean SE.mean CI.mean.0.95 var
## 1.577175e+07 1.222850e+04 1.312126e+04 1.790370e+02 3.512600e+02 3.852919e+07
## std.dev coef.var
## 6.207189e+03 4.730635e-01
# Drop the districts whose per-pupil expenditure is missing before plotting.
district_clean <- district %>%
  filter(!is.na(DPFEAOPFK))
# Expenditures run from roughly 6,755 to 178,467 dollars, so a 5-dollar bin
# would produce tens of thousands of sub-pixel bars. 1,000-dollar bins keep
# the shape of the distribution readable.
ggplot(district_clean, aes(x = DPFEAOPFK)) +
  geom_histogram(binwidth = 1000, fill = "blue", color = "black") +
  labs(title = "Histogram of Expenditures", x = "Dollars")
# Try both a log and a square-root transformation to see which one brings
# the expenditure distribution closer to normal.
district_clean_log <- mutate(district_clean, log_expenditure = log(DPFEAOPFK))
district_clean_sqrt <- mutate(
  district_clean,
  sqrt_expenditure = sqrt(DPFEAOPFK)
)
# Distribution of the log-transformed per-pupil expenditure.
ggplot(data = district_clean_log, mapping = aes(log_expenditure)) +
  geom_histogram(color = "black", fill = "green", binwidth = 0.2) +
  ggtitle("Histogram of Log Transformed Expenditure") +
  xlab("log_expenditure")
# Distribution of the square-root-transformed per-pupil expenditure.
# The transformed values span roughly 82 to 422, so the 0.2 bin width used
# for the log plot would create about 1,700 bars here; a width of 5 gives a
# readable histogram.
ggplot(district_clean_sqrt, aes(x = sqrt_expenditure)) +
  geom_histogram(binwidth = 5, fill = "green", color = "black") +
  labs(
    title = "Histogram of Sqrt Transformed Expenditure",
    x = "sqrt_expenditure"
  )
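The log-transformed variable is closer to a normal distribution than the original data, which had a strong right (positive) skew: the mean (about 13,121) exceeds the median (about 12,229), and the maximum (178,467) lies far above the bulk of the districts.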