library(LearnEDAfunctions)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: ggplot2
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.1.0 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(vcd)
## Warning: package 'vcd' was built under R version 4.5.2
## Loading required package: grid
data("football")
football_df <- as.data.frame(football)
names(football_df) <- c("team1", "team2")
football_df$team1 <- as.numeric(football_df$team1)
football_df$team2 <- as.numeric(football_df$team2)
bins <- seq(-0.5, max(football_df$team1) + 6.5, by = 7)
bin.mids <- (bins[-1] + bins[-length(bins)]) / 2
ggplot(football_df, aes(team1)) +
geom_histogram(breaks = bins,
fill = "white",
color = "red") +
labs(title = "Histogram of Winning Team Scores",
x = "team1 score",
y = "count")

FL <- fivenum(football_df$team1)[2]
FU <- fivenum(football_df$team1)[4]
m <- (FL + FU) / 2
s <- (FU - FL) / 1.349
m; s
## [1] 30
## [1] 13.34322
p <- ggplot(football_df, aes(team1)) +
geom_histogram(breaks = bins)
out <- ggplot_build(p)$data[[1]] %>%
transmute(count,
x = (xmin + xmax)/2)
out <- mutate(out, ROOT = sqrt(count))
ggplot(out, aes(x, ROOT)) +
geom_col() +
labs(title = "Rootogram of team1 Scores",
x = "Score midpoint",
y = "sqrt(count)")

fit <- fit.gaussian(football_df$team1, bins, m, s)
rootogram(fit$counts, fit$expected, type = "hanging")

rootogram(fit$counts, fit$expected, type = "deviation")

Question 2
data("studentdata")
women <- subset(studentdata, Gender == "female")
women$Height[is.na(women$Height)] <- median(women$Height, na.rm = TRUE)
bins <- seq(floor(min(women$Height)),
ceiling(max(women$Height)),
by = 3)
p <- ggplot(women, aes(Height)) +
geom_histogram(breaks = bins)
out <- ggplot_build(p)$data[[1]] %>%
transmute(
count,
x = (xmin + xmax) / 2 )
out <- mutate(out, ROOT = sqrt(count))
ggplot(out, aes(x, ROOT)) +
geom_line() +
xlab("Bin Mid Points") +
ylab("Root Counts")

out <- mutate(out,
Smooth.Root = as.vector(smooth(ROOT, twiceit = TRUE)))
ggplot(out, aes(x, ROOT)) +
geom_line() +
geom_line(aes(x, Smooth.Root), color = "red") +
xlab("Bin Mid Points") +
ylab("Root Counts")

peak <- out$x[which.max(out$Smooth.Root)]
out <- mutate(out, Shift.Sq = round((x - peak)^2, 2))
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
p1 <- ggplot(out, aes(Shift.Sq, power.t(Smooth.Root, 0.5))) + geom_point() + ggtitle("Power = 0.5")
p2 <- ggplot(out, aes(Shift.Sq, power.t(Smooth.Root, 0))) + geom_point() + ggtitle("Power = 0")
p3 <- ggplot(out, aes(Shift.Sq, power.t(Smooth.Root, -0.5))) + geom_point() + ggtitle("Power = -0.5")
p4 <- ggplot(out, aes(Shift.Sq, power.t(Smooth.Root, -1))) + geom_point() + ggtitle("Power = -1")
grid.arrange(p1, p2, p3, p4)

fit <- lm(I(Smooth.Root^(-0.75)) ~ Shift.Sq, data = out)
b <- coef(fit)
g <- b[1] + b[2] * out$Shift.Sq
g[g <= 0] <- NA
out <- mutate(out,
Final.Smooth = g^(-2 / 0.75),
Final.S.Root = sqrt(Final.Smooth))
ggplot(out, aes(x, ROOT)) +
geom_line() +
geom_line(aes(x, Final.S.Root), color = "red") +
xlab("Bin Mid Points") +
ylab("Root Counts")

out <- mutate(out, Residual = ROOT - Final.S.Root)
ggplot(out, aes(x, Residual)) +
geom_point() +
geom_hline(yintercept = 0) +
xlab("Bin Mid Points") +
ylab("Residuals")
