Load Libraries/Dataset

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readxl)
library(pastecs)
## 
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
# Read the district-level spreadsheet (expected in the working directory)
district <- read_excel(path = "district.xls")
# Preview the first six rows to confirm the import and column types
head(district, n = 6)
## # A tibble: 6 × 137
##   DISTNAME DISTRICT DZCNTYNM REGION DZRATING DZCAMPUS DPETALLC DPETBLAP DPETHISP
##   <chr>    <chr>    <chr>    <chr>  <chr>       <dbl>    <dbl>    <dbl>    <dbl>
## 1 CAYUGA … 001902   001 AND… 07     A               3      574      4.4     11.5
## 2 ELKHART… 001903   001 AND… 07     A               4     1150      4       11.8
## 3 FRANKST… 001904   001 AND… 07     A               3      808      8.5     11.3
## 4 NECHES … 001906   001 AND… 07     A               2      342      8.2     13.5
## 5 PALESTI… 001907   001 AND… 07     B               6     3360     25.1     42.9
## 6 WESTWOO… 001908   001 AND… 07     B               4     1332     19.7     26.2
## # ℹ 128 more variables: DPETWHIP <dbl>, DPETINDP <dbl>, DPETASIP <dbl>,
## #   DPETPCIP <dbl>, DPETTWOP <dbl>, DPETECOP <dbl>, DPETLEPP <dbl>,
## #   DPETSPEP <dbl>, DPETBILP <dbl>, DPETVOCP <dbl>, DPETGIFP <dbl>,
## #   DA0AT21R <dbl>, DA0912DR21R <dbl>, DAGC4X21R <dbl>, DAGC5X20R <dbl>,
## #   DAGC6X19R <dbl>, DA0GR21N <dbl>, DA0GS21N <dbl>, DDA00A001S22R <dbl>,
## #   DDA00A001222R <dbl>, DDA00A001322R <dbl>, DDA00AR01S22R <dbl>,
## #   DDA00AR01222R <dbl>, DDA00AR01322R <dbl>, DDA00AM01S22R <dbl>, …

Select Variable

# Variable of interest within the district dataset: total actual operating
# expenditures per pupil, stored in the DPFEAOPFK column.
MainVariable <- district[["DPFEAOPFK"]]

Descriptive Stats

# Basic counts (including the number of NAs) plus descriptive statistics;
# normality statistics are left off and the confidence level stays at 95%.
stat.desc(MainVariable, basic = TRUE, desc = TRUE, norm = FALSE, p = 0.95)
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 1.202000e+03 0.000000e+00 5.000000e+00 6.755000e+03 1.784670e+05 1.717120e+05 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 1.577175e+07 1.222850e+04 1.312126e+04 1.790370e+02 3.512600e+02 3.852919e+07 
##      std.dev     coef.var 
## 6.207189e+03 4.730635e-01

Remove NAs

# Keep only the districts that report a per-pupil expenditure value
district_clean <- filter(district, !is.na(DPFEAOPFK))

Histogram!!

# Histogram of per-pupil operating expenditures (rows with NA already removed).
# The values run from roughly 6,800 to 178,000 dollars, so the bin width has to
# be on the order of 1,000 dollars; a width of only a few dollars would split
# the range into tens of thousands of near-empty bins and hide the shape.
ggplot(district_clean, aes(x = DPFEAOPFK)) +
  geom_histogram(binwidth = 1000, fill = "blue", color = "black") +
  labs(title = "Histogram of Expenditures", x = "Dollars")

Transform the data

# Try both a log and a square-root transformation to see which one brings the
# expenditure distribution closer to normal.

# Natural log of per-pupil expenditure
district_clean_log <- mutate(district_clean, log_expenditure = log(DPFEAOPFK))

# Square root of per-pupil expenditure
district_clean_sqrt <- mutate(district_clean, sqrt_expenditure = sqrt(DPFEAOPFK))

Histogram Part 2!! (with transformed data)

# Distribution of the log-transformed expenditures; 0.2 log units per bin
ggplot(data = district_clean_log, mapping = aes(x = log_expenditure)) +
  geom_histogram(
    binwidth = 0.2,
    fill = "green",
    color = "black"
  ) +
  labs(
    title = "Histogram of Log Transformed Expenditure",
    x = "log_expenditure"
  )

# Distribution of the square-root-transformed expenditures.
# On the sqrt scale the values run from roughly 82 to 422, so the bin width
# suited to the log plot (0.2) would produce well over a thousand bins here;
# a width of 5 gives a readable number of bars.
ggplot(district_clean_sqrt, aes(x = sqrt_expenditure)) +
  geom_histogram(binwidth = 5, fill = "green", color = "black") +
  labs(title = "Histogram of Sqrt Transformed Expenditure", x = "sqrt_expenditure")

Thoughts

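The log-transformed variable shows a more normal distribution compared to the original data, which had a right (positive) skew: the mean (about 13,121) is above the median (about 12,229), and the maximum (178,467) lies far out in the upper tail.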