## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: ggplot2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.1.0 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'vcd' was built under R version 4.5.2
## Loading required package: grid
head(football)
## winner loser
## 1 50 0
## 2 0 0
## 3 55 0
## 4 24 3
## 5 28 20
## 6 8 7
bins <- seq(-0.5, 76.5, 7.0)
bin.mids <- (bins[-1] + bins[-length(bins)]) / 2
ggplot(football, aes(winner)) +
geom_histogram(breaks = bins,
fill = "white",
color = "red")
fivenum(football$winner)
## [1] 0 21 30 39 73
\[m=\frac{39+21}{2}=30\] \[s=\frac{39-21}{1.349}\approx13\] So, the matching Gaussian curve is N(30,13).
s <- fit.gaussian(football$winner, bins, 30, 13)
options(digits=3)
(df <- data.frame(Mid=bin.mids, d=s$counts, sqrt.d=sqrt(s$counts),
Prob=s$probs, e=s$expected, sqrt.e=sqrt(s$expected),
Residual=s$residual))
## Mid d sqrt.d Prob e sqrt.e Residual
## 1 3 7 2.65 0.02584 12.017 3.467 -0.8208
## 2 10 30 5.48 0.06685 31.086 5.576 -0.0983
## 3 17 58 7.62 0.13028 60.581 7.783 -0.1676
## 4 24 92 9.59 0.19129 88.950 9.431 0.1603
## 5 31 110 10.49 0.21164 98.411 9.920 0.5679
## 6 38 71 8.43 0.17643 82.041 9.058 -0.6315
## 7 45 49 7.00 0.11083 51.534 7.179 -0.1787
## 8 52 26 5.10 0.05245 24.389 4.938 0.1605
## 9 59 15 3.87 0.01870 8.695 2.949 0.9243
## 10 66 6 2.45 0.00502 2.335 1.528 0.9215
## 11 73 1 1.00 0.00102 0.472 0.687 0.3130
p <- ggplot(football, aes(winner)) +
geom_histogram(breaks=bins)
out <- ggplot_build(p)$data[[1]]
select(out, count, x, xmin, xmax)
## count x xmin xmax
## 1 7 3 -0.5 6.5
## 2 30 10 6.5 13.5
## 3 58 17 13.5 20.5
## 4 92 24 20.5 27.5
## 5 110 31 27.5 34.5
## 6 71 38 34.5 41.5
## 7 49 45 41.5 48.5
## 8 26 52 48.5 55.5
## 9 15 59 55.5 62.5
## 10 6 66 62.5 69.5
## 11 1 73 69.5 76.5
ggplot(out, aes(x, sqrt(count))) +
geom_col() +
geom_line(data = df,
aes(bin.mids, sqrt.e), color="red")
rootogram(s$counts, s$expected)
rootogram(s$counts, s$expected, type="deviation")
The residuals are somewhat inconsistent. This dataset has some large
residuals, and the normal curve may not be the best approximation.
football$root.winner <- sqrt(football$winner)
bins <- seq(-0.5, 9.5, 1.0)
bin.mids <- (bins[-1] + bins[-length(bins)]) /2
ggplot(football, aes(root.winner)) +
geom_histogram(breaks=bins,
fill="white",
color="red")
fivenum(football$root.winner)
## [1] 0.00 4.58 5.48 6.24 8.54
\[m=\frac{6.24+4.58}{2}=5.41\] \[s=\frac{6.24-4.58}{1.349}\approx1.23\] So, the corresponding Gaussian curve for the root data is N(5.41, 1.23).
s <- fit.gaussian(football$root.winner, bins, 5.41, 1.23)
options(digits=3)
(df <- data.frame(Mid=bin.mids, d=s$counts, sqrt.d=sqrt(s$counts),
Prob=s$probs, e=s$expected, sqrt.e=sqrt(s$expected),
Residual=s$residual))
## Mid d sqrt.d Prob e sqrt.e Residual
## 1 0 1 1.00 0.000032 0.0149 0.122 0.8780
## 2 1 1 1.00 0.000706 0.3285 0.573 0.4268
## 3 2 5 2.24 0.008255 3.8386 1.959 0.2768
## 4 3 27 5.20 0.051236 23.8246 4.881 0.3151
## 5 4 61 7.81 0.169469 78.8032 8.877 -1.0669
## 6 5 145 12.04 0.299466 139.2515 11.800 0.2411
## 7 6 143 11.96 0.283075 131.6297 11.473 0.4853
## 8 7 65 8.06 0.143119 66.5501 8.158 -0.0956
## 9 8 16 4.00 0.038643 17.9689 4.239 -0.2390
## 10 9 1 1.00 0.005557 2.5842 1.608 -0.6075
p <- ggplot(football, aes(root.winner)) +
geom_histogram(breaks=bins)
out <- ggplot_build(p)$data[[1]]
select(out, count, x, xmin, xmax)
## count x xmin xmax
## 1 1 0 -0.5 0.5
## 2 1 1 0.5 1.5
## 3 5 2 1.5 2.5
## 4 27 3 2.5 3.5
## 5 61 4 3.5 4.5
## 6 145 5 4.5 5.5
## 7 143 6 5.5 6.5
## 8 65 7 6.5 7.5
## 9 16 8 7.5 8.5
## 10 1 9 8.5 9.5
ggplot(out, aes(x, sqrt(count))) +
geom_col() +
geom_line(data=df,
aes(bin.mids, sqrt.e), color="red")
rootogram(s$counts, s$expected)
rootogram(s$counts, s$expected, type="deviation")
The goodness of the fit of the normal curve improves after using the
root reexpression. The significance of the residuals decreases and the
shape of the normal curve fits this reexpressed data better.
head(studentdata)
## Student Height Gender Shoes Number Dvds ToSleep WakeUp Haircut Job Drink
## 1 1 67 female 10 5 10 -2.5 5.5 60 30.0 water
## 2 2 64 female 20 7 5 1.5 8.0 0 20.0 pop
## 3 3 61 female 12 2 6 -1.5 7.5 48 0.0 milk
## 4 4 61 female 3 6 40 2.0 8.5 10 0.0 water
## 5 5 70 male 4 5 6 0.0 9.0 15 17.5 pop
## 6 6 63 female NA 3 5 1.0 8.5 25 0.0 water
w.students <- studentdata[studentdata$Gender == "female",]
head(w.students)
## Student Height Gender Shoes Number Dvds ToSleep WakeUp Haircut Job Drink
## 1 1 67 female 10 5 10 -2.5 5.5 60 30 water
## 2 2 64 female 20 7 5 1.5 8.0 0 20 pop
## 3 3 61 female 12 2 6 -1.5 7.5 48 0 milk
## 4 4 61 female 3 6 40 2.0 8.5 10 0 water
## 6 6 63 female NA 3 5 1.0 8.5 25 0 water
## 7 7 61 female 12 3 53 1.5 7.5 35 20 water
bins <- seq(54, 84, 3)
bin.mids <- (bins[-1] + bins[-length(bins)]) /2
ggplot(w.students, aes(Height)) +
geom_histogram(breaks=bins,
fill="white",
color="red")
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_bin()`).
fivenum(w.students$Height)
## [1] 54.0 63.0 64.5 67.0 84.0
\[m=\frac{67+63}{2}=65\] \[s=\frac{67-63}{1.349}\approx2.97\] So, we will use the Gaussian curve N(65, 2.97).
s <- fit.gaussian(w.students$Height, bins, 65, 2.97)
options(digits=3)
(df <- data.frame(Mid=bin.mids, d=s$counts, sqrt.d=sqrt(s$counts),
Prob=s$probs, e=s$expected, sqrt.e=sqrt(s$expected),
Residual=s$residual))
## Mid d sqrt.d Prob e sqrt.e Residual
## 1 55.5 6 2.45 3.43e-03 1.49e+00 1.22114 1.228
## 2 58.5 25 5.00 4.26e-02 1.85e+01 4.30501 0.695
## 3 61.5 114 10.68 2.04e-01 8.88e+01 9.42497 1.252
## 4 64.5 162 12.73 3.81e-01 1.66e+02 12.88196 -0.154
## 5 67.5 94 9.70 2.79e-01 1.21e+02 11.01953 -1.324
## 6 70.5 21 4.58 7.98e-02 3.47e+01 5.89207 -1.309
## 7 73.5 1 1.00 8.83e-03 3.84e+00 1.96031 -0.960
## 8 76.5 4 2.00 3.74e-04 1.63e-01 0.40331 1.597
## 9 79.5 0 0.00 5.98e-06 2.60e-03 0.05099 -0.051
## 10 82.5 1 1.00 3.57e-08 1.55e-05 0.00394 0.996
p <- ggplot(w.students, aes(Height)) +
geom_histogram(breaks=bins)
out <- ggplot_build(p)$data[[1]]
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_bin()`).
select(out, count, x, xmin, xmax)
## count x xmin xmax
## 1 6 55.5 54 57
## 2 25 58.5 57 60
## 3 114 61.5 60 63
## 4 162 64.5 63 66
## 5 94 67.5 66 69
## 6 21 70.5 69 72
## 7 1 73.5 72 75
## 8 4 76.5 75 78
## 9 0 79.5 78 81
## 10 1 82.5 81 84
ggplot(out, aes(x, sqrt(count))) +
geom_col() +
geom_line(data = df,
aes(bin.mids, sqrt.e), color="red")
rootogram(s$counts, s$expected)
rootogram(s$counts, s$expected, type="deviation")