library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
# Ask the user to pick the Excel workbook interactively, load it into a
# tibble, and list the column names that were read.
my_data <- read_excel(path = file.choose())
colnames(my_data)
## [1] "State"              "Scores (4th Grade)" "Year"              
## [4] "Subject"            "Treated"

The variable `Scores (4th Grade)` records the performance scores of 4th grade students. It measures academic achievement levels across different states and years.

library(pastecs)
## 
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## The following object is masked from 'package:tidyr':
## 
##     extract
# First attempt at descriptive statistics for the score column. The column is
# still stored as character at this point, so every statistic comes back NA.
stat.desc(my_data[["Scores (4th Grade)"]])
##  nbr.val nbr.null   nbr.na      min      max    range      sum   median 
##       NA       NA       NA       NA       NA       NA       NA       NA 
##     mean  SE.mean  CI.mean      var  std.dev coef.var 
##       NA       NA       NA       NA       NA       NA
# Inspect the column types to see how each variable was read in.
utils::str(my_data)
## tibble [104 × 5] (S3: tbl_df/tbl/data.frame)
##  $ State             : chr [1:104] "Missouri" "Indiana" "Ohio" "Kentucky" ...
##  $ Scores (4th Grade): chr [1:104] "222" "221" "219" "215" ...
##  $ Year              : num [1:104] 1992 1992 1992 1992 1992 ...
##  $ Subject           : chr [1:104] "Math" "Math" "Math" "Math" ...
##  $ Treated           : num [1:104] 0 1 1 0 1 0 0 1 1 0 ...
library(dplyr)

# Convert the score column from character to numeric in place. Entries that
# cannot be parsed as numbers become NA (as.numeric() warns when this happens).
my_data <- mutate(
  my_data,
  `Scores (4th Grade)` = as.numeric(`Scores (4th Grade)`)
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Scores (4th Grade) = as.numeric(`Scores (4th Grade)`)`.
## Caused by warning:
## ! NAs introduced by coercion
# Re-check the structure to confirm the score column is now numeric.
utils::str(my_data)
## tibble [104 × 5] (S3: tbl_df/tbl/data.frame)
##  $ State             : chr [1:104] "Missouri" "Indiana" "Ohio" "Kentucky" ...
##  $ Scores (4th Grade): num [1:104] 222 221 219 215 214 211 208 204 229 225 ...
##  $ Year              : num [1:104] 1992 1992 1992 1992 1992 ...
##  $ Subject           : chr [1:104] "Math" "Math" "Math" "Math" ...
##  $ Treated           : num [1:104] 0 1 1 0 1 0 0 1 1 0 ...
library(pastecs)

# Descriptive statistics for the score column now that it is numeric.
my_data %>%
  pull(`Scores (4th Grade)`) %>%
  stat.desc()
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 1.020000e+02 0.000000e+00 2.000000e+00 2.040000e+02 2.490000e+02 4.500000e+01 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 2.377800e+04 2.340000e+02 2.331176e+02 9.862473e-01 1.956449e+00 9.921374e+01 
##      std.dev     coef.var 
## 9.960610e+00 4.272782e-02
library(dplyr)

# Keep only the rows that have a score; rows whose score is NA are dropped.
clean_data <- my_data %>%
  drop_na(`Scores (4th Grade)`)
library(ggplot2)

# Histogram of the raw 4th grade scores in 5-point bins. Title and axis labels
# are set explicitly so this plot reads the same way as the log-score
# histogram it is compared against.
ggplot(clean_data, aes(x = `Scores (4th Grade)`)) +
  geom_histogram(binwidth = 5, fill = "blue", color = "black") +
  labs(
    title = "Histogram of 4th Grade Scores",
    x = "Scores (4th Grade)",
    y = "Frequency"
  )

The distribution of 4th grade scores is roughly centered around the mid-230s, with most scores clustered between about 225 and 245 and relatively few very low or very high values.

# Add the natural log of the score as a new column.
clean_data <- clean_data %>%
  mutate(log_scores = log(`Scores (4th Grade)`))

# Histogram of the log scores. The raw scores run from 204 to 249, so the log
# scores span only about 0.2 units (log(249) - log(204)). A bin width of 0.05
# would leave roughly four bars; 0.02 yields about as many bins as the
# raw-score histogram (bin width 5), which keeps the two plots comparable.
ggplot(clean_data, aes(x = log_scores)) +
  geom_histogram(binwidth = 0.02, fill = "green", color = "black") +
  labs(
    title = "Histogram of Log(4th Grade Scores)",
    x = "Log(Scores)",
    y = "Frequency"
  )

After applying a log transformation, the distribution of 4th grade scores becomes more compressed and more evenly shaped. The values are more concentrated, and the transformation reduces the effect of skewness seen in the original histogram. Compared to the original histogram, the log-transformed distribution is more balanced and shows reduced variability.