library(readxl)
# Read the training workbook into `Training`.
# NOTE(review): the path is absolute and specific to one machine; anyone else
# running this script has to edit it (consider a project-relative path).
Training <- read_excel(
  "C:/Users/celin/OneDrive/Desktop/Applied Quant Class/Training.xls"
)

# View() opens a data-viewer window, which only makes sense when a person is
# at the console; skip it when the script is run non-interactively
# (Rscript, knitting) so the run is not interrupted.
if (interactive()) {
  View(Training)
}

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pastecs)
## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
# Work on a copy so the imported `Training` object is left as read.
Training_Cleaned <- Training

# Convert the columns used below to numeric. Values that cannot be parsed as
# numbers become NA (hence the coercion warnings) and those rows are removed
# by na.omit() at the end of this block.
Training_Cleaned[["All Students (7/1/2018 - 6/30/2022)"]] <- as.numeric(
  Training_Cleaned[["All Students (7/1/2018 - 6/30/2022)"]]
)
## Warning: NAs introduced by coercion
Training_Cleaned[["Percent Received Credential"]] <- as.numeric(
  Training_Cleaned[["Percent Received Credential"]]
)
## Warning: NAs introduced by coercion
# NOTE(review): the target column is named "Percent Successfully Completed"
# but it is filled from the "Successfully Completed Program" column; confirm
# that the source column really holds the percentage that is wanted here.
Training_Cleaned[["All Students Percent Successfully Completed"]] <- as.numeric(
  Training_Cleaned[["All Students Successfully Completed Program(7/1/2018 - 6/30/2022)"]]
)
## Warning: NAs introduced by coercion

# Keep only the columns needed for the analysis and drop every row that has
# an NA in any of them. The provider-name column header contains a line
# break, so its backtick-quoted name spans two lines on purpose.
Training_Cleaned_Filtered <- Training_Cleaned %>%
  select(
    `County`,
    `Provider
Name`,
    `All Students (7/1/2018 - 6/30/2022)`,
    `Percent Received Credential`,
    `All Students Percent Successfully Completed`
  ) %>%
  na.omit()
# Descriptive statistics (count, min/max, mean, median, variance, ...) for
# the all-students column of the filtered data.
pastecs::stat.desc(
  Training_Cleaned_Filtered[["All Students (7/1/2018 - 6/30/2022)"]]
)
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 1.912000e+03 0.000000e+00 0.000000e+00 5.000000e+00 6.366000e+03 6.361000e+03 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 4.066900e+05 1.030000e+02 2.127040e+02 8.621951e+00 1.690942e+01 1.421343e+05 
##      std.dev     coef.var 
## 3.770071e+02 1.772450e+00
# Histogram of the untransformed all-students column on the density scale,
# with a kernel density estimate drawn over it in red.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
hist(
  Training_Cleaned_Filtered$`All Students (7/1/2018 - 6/30/2022)`,
  breaks = 10,
  probability = TRUE
)
lines(
  density(Training_Cleaned_Filtered$`All Students (7/1/2018 - 6/30/2022)`),
  col = "red",
  lwd = 2
)

# Scatter plot of the untransformed all-students column (x) against the
# "Percent Successfully Completed" column (y), titled "Previous".
# NOTE(review): the y column was populated earlier in this script from the
# "Successfully Completed Program" column; confirm it is the intended measure.
ggplot(
  Training_Cleaned_Filtered,
  aes(
    x = `All Students (7/1/2018 - 6/30/2022)`,
    y = `All Students Percent Successfully Completed`
  )
) +
  geom_point() +
  ggtitle("Previous")

# Square-root transform of the all-students column, kept side by side with
# the raw values so the two can be compared.
Previous_sqrt <- Training_Cleaned_Filtered %>%
  mutate(
    ALLSTUDENTSSQRT = sqrt(`All Students (7/1/2018 - 6/30/2022)`)
  ) %>%
  select(
    `All Students (7/1/2018 - 6/30/2022)`,
    ALLSTUDENTSSQRT
  )

# Print the first rows as a quick check of the transformation.
head(Previous_sqrt)
## # A tibble: 6 × 2
##   `All Students (7/1/2018 - 6/30/2022)` ALLSTUDENTSSQRT
##                                   <dbl>           <dbl>
## 1                                    38            6.16
## 2                                    27            5.20
## 3                                    12            3.46
## 4                                   198           14.1 
## 5                                   134           11.6 
## 6                                    20            4.47
# Histogram of the square-root-transformed column on the density scale, with
# a kernel density estimate drawn over it in red.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
hist(
  Previous_sqrt$ALLSTUDENTSSQRT,
  breaks = 10,
  probability = TRUE
)
lines(
  density(Previous_sqrt$ALLSTUDENTSSQRT),
  col = "red",
  lwd = 2
)