library(readxl)
# Load the district-level data from the Excel workbook in the working directory.
district <- read_excel(path = "district.xls")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
# Build a working object holding the district name and the two variables of
# interest: DPFPABILP and DDH00A001S22R.
obj2 <- select(district, DISTNAME, DPFPABILP, DDH00A001S22R)
# Drop rows where either variable is missing, and rows where DDH00A001S22R is
# not strictly positive. Comma-separated conditions in filter() are ANDed.
obj2_cleaned <- filter(
  obj2,
  !is.na(DPFPABILP),
  !is.na(DDH00A001S22R),
  DDH00A001S22R > 0
)
#1.) From the data chosen, select a variable that you are interested in.
#The variable I've chosen is DPFPABILP: Expenditure % Bilingual/ESL Education.
#2.) Use pastecs::stat.desc to describe the variable. Include a few sentences about what the variable is and what it's measuring.
# The raw column (NAs included) is passed on purpose so that nbr.na is reported.
# norm = TRUE adds skewness, kurtosis and the normality-test statistics.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
pastecs::stat.desc(district$DPFPABILP, norm = TRUE)
## nbr.val nbr.null nbr.na min max range
## 1.202000e+03 1.840000e+02 5.000000e+00 0.000000e+00 2.600000e+01 2.600000e+01
## sum median mean SE.mean CI.mean.0.95 var
## 9.010000e+02 4.000000e-01 7.495840e-01 3.817826e-02 7.490350e-02 1.752011e+00
## std.dev coef.var skewness skew.2SE kurtosis kurt.2SE
## 1.323635e+00 1.765827e+00 9.156793e+00 6.488301e+01 1.414057e+02 5.013990e+02
## normtest.W normtest.p
## 4.668057e-01 1.004389e-50
#The variable used is DPFPABILP, which comes from the TEA 2021-22 district data set. It stands for Expenditure % Bilingual/ESL Education, so it is expressed as a percentage of expenditures rather than as a dollar amount. It reflects how much of a district's spending goes to students who are bilingual or are learning English as a second language.
#3.) Remove NA's if needed using a dplyr:filter.
# Keep every column of `district`, but only the rows where DPFPABILP is present.
obj2_removed <- filter(district, !is.na(DPFPABILP))
#4.) Provide a histogram of the variable.
# probability = TRUE puts the bars on the density scale so the kernel density
# curve drawn by lines() is directly comparable to them.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
hist(obj2_removed$DPFPABILP, breaks = 10, probability = TRUE)
lines(density(obj2_removed$DPFPABILP), col = "red", lwd = 2)
#5.) Transform the variable using the log transformation or square root transformation using dplyr.
# obj2_cleaned only guarantees DDH00A001S22R > 0. DPFPABILP can still be 0, and
# log(0) is -Inf, so districts reporting 0% are dropped before taking logs.
# The transformation is done with dplyr::mutate() so the logged columns exist
# as data rather than only inside the plot call.
obj2_logged <- obj2_cleaned %>%
  filter(DPFPABILP > 0) %>%
  mutate(
    log_DPFPABILP = log(DPFPABILP),
    log_DDH00A001S22R = log(DDH00A001S22R)
  )
# Axis labels are set explicitly so they still read as the log of each variable.
ggplot(obj2_logged, aes(x = log_DPFPABILP, y = log_DDH00A001S22R)) +
  geom_point() +
  labs(x = "log(DPFPABILP)", y = "log(DDH00A001S22R)") +
  ggtitle("Log-Transformed Data")
#6.) Provide a histogram of the transformed variable.
# Histogram of the untransformed variable, shown again for comparison.
hist(obj2_removed$DPFPABILP, breaks = 10, probability = TRUE)
lines(density(obj2_removed$DPFPABILP), col = "red", lwd = 2)
# log(0) is -Inf, so only strictly positive values are transformed. With -Inf
# present, density() keeps that mass at -Inf and returns a sub-density, so the
# curve would not be on the same scale as the histogram bars.
log_DPFPABILP <- obj2_removed %>%
  filter(DPFPABILP > 0) %>%
  mutate(log_DPFPABILP = log(DPFPABILP)) %>%
  pull(log_DPFPABILP)
hist(
  log_DPFPABILP,
  breaks = 10,
  probability = TRUE,
  main = "Histogram of log(DPFPABILP)",
  xlab = "log(DPFPABILP)"
)
lines(density(log_DPFPABILP), col = "red", lwd = 2)