Homework 4

library(readxl)
# Read the district workbook from the working directory into a tibble.
district <- read_excel(path = "district.xls")
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(pastecs)

## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

# Build a working data frame that keeps only the three columns used below:
# "DISTNAME", "DPFPABILP", and "DDH00A001S22R".
obj2 <- dplyr::select(district, DISTNAME, DPFPABILP, DDH00A001S22R)

# Drop rows where either variable is missing, and keep only rows whose
# DDH00A001S22R is strictly positive.
obj2_cleaned <- obj2 %>%
  filter(
    !is.na(DPFPABILP),
    !is.na(DDH00A001S22R),
    DDH00A001S22R > 0
  )

1.) From the data chosen, select a variable that you are interested in. - The variable I’ve chosen is DPFPABILP: Expenditure % Bilingual/ESL Education.

#2.) Use pastecs::stat.desc to describe the variable. Include a few sentences about what the variable is and what it's measuring.

# Descriptive statistics for DPFPABILP on the raw column (NAs are counted in
# nbr.na rather than removed). norm = TRUE adds the skewness, kurtosis, and
# normality-test entries. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
stat.desc(district$DPFPABILP, norm = TRUE)

##      nbr.val     nbr.null       nbr.na          min          max        range 
## 1.202000e+03 1.840000e+02 5.000000e+00 0.000000e+00 2.600000e+01 2.600000e+01 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 9.010000e+02 4.000000e-01 7.495840e-01 3.817826e-02 7.490350e-02 1.752011e+00 
##      std.dev     coef.var     skewness     skew.2SE     kurtosis     kurt.2SE 
## 1.323635e+00 1.765827e+00 9.156793e+00 6.488301e+01 1.414057e+02 5.013990e+02 
##   normtest.W   normtest.p 
## 4.668057e-01 1.004389e-50

#The variable used is DPFPABILP, which comes from the TEA 2021-22 district data set. It stands for "Expenditure % Bilingual/ESL Education" and measures the percentage of a district's expenditures that goes to bilingual/ESL education, i.e., programs for students who are bilingual or are learning English as a second language.

#3.) Remove NA's if needed using a dplyr:filter.
# Keep every column of district, but only the rows where DPFPABILP is present.
obj2_removed <- dplyr::filter(district, !is.na(DPFPABILP))

#4.) Provide a histogram of the variable.
# Density-scaled histogram of DPFPABILP (NAs already removed) with a kernel
# density estimate overlaid in red. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
hist(obj2_removed$DPFPABILP, breaks = 10, probability = TRUE)
lines(density(obj2_removed$DPFPABILP), col = "red", lwd = 2)

#5.) Transform the variable using the log transformation or square root transformation using dplyr.
# Log-transform both variables with dplyr, then plot them against each other.
# obj2_cleaned only guarantees DDH00A001S22R > 0; DPFPABILP still contains
# zeros, and log(0) is -Inf, so those rows are dropped explicitly before the
# transformation instead of feeding infinite values to the plot.
obj2_logged <- obj2_cleaned %>%
  filter(DPFPABILP > 0) %>%
  mutate(
    log_DPFPABILP = log(DPFPABILP),
    log_DDH00A001S22R = log(DDH00A001S22R)
  )

ggplot(obj2_logged, aes(x = log_DPFPABILP, y = log_DDH00A001S22R)) +
  geom_point() +
  labs(x = "log(DPFPABILP)", y = "log(DDH00A001S22R)") +
  ggtitle("Log-Transformed Data")

#6.) Provide a histogram of the transformed variable.
# Histogram of the untransformed DPFPABILP with its density curve in red
# (presumably kept as a reference for the transformed histogram that follows).
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
hist(obj2_removed$DPFPABILP, breaks = 10, probability = TRUE)
lines(density(obj2_removed$DPFPABILP), col = "red", lwd = 2)

# Histogram of the log-transformed variable with its density curve in red.
# DPFPABILP contains zeros and log(0) is -Inf. hist() silently discards
# non-finite values, while density() treats them as point mass at -Inf and
# returns a sub-density, so the curve and the bars end up on different scales.
# Dropping the zeros explicitly keeps both on the same set of observations.
obj2_removed_logged <- obj2_removed %>%
  filter(DPFPABILP > 0) %>%
  mutate(log_DPFPABILP = log(DPFPABILP))

hist(
  obj2_removed_logged$log_DPFPABILP,
  breaks = 10,
  probability = TRUE,
  main = "Histogram of log(DPFPABILP)",
  xlab = "log(DPFPABILP)"
)
lines(density(obj2_removed_logged$log_DPFPABILP), col = "red", lwd = 2)

Homework 4

Sarah Rodriguez

2024-10-05