library(MASS)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(rstatix)
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:janitor':
##
## make_clean_names
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:stats':
##
## filter
boston_data <- Boston
colnames(boston_data) <- tolower(colnames(boston_data))
head(boston_data, 7)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0.0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## 7 0.08829 12.5 7.87 0 0.524 6.012 66.6 5.5605 5 311 15.2 395.60 12.43
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
## 7 22.9
summary(boston_data)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
str(boston_data)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
boston_data$rad <- as.factor(boston_data$rad)
boston_data$zn <- as.factor(boston_data$zn)
tabyl(boston_data, rad)
## rad n percent
## 1 20 0.03952569
## 2 24 0.04743083
## 3 38 0.07509881
## 4 110 0.21739130
## 5 115 0.22727273
## 6 26 0.05138340
## 7 17 0.03359684
## 8 24 0.04743083
## 24 132 0.26086957
tabyl(boston_data, zn)
## zn n percent
## 0 372 0.735177866
## 12.5 10 0.019762846
## 17.5 1 0.001976285
## 18 1 0.001976285
## 20 21 0.041501976
## 21 4 0.007905138
## 22 10 0.019762846
## 25 10 0.019762846
## 28 3 0.005928854
## 30 6 0.011857708
## 33 4 0.007905138
## 34 3 0.005928854
## 35 3 0.005928854
## 40 7 0.013833992
## 45 6 0.011857708
## 52.5 3 0.005928854
## 55 3 0.005928854
## 60 4 0.007905138
## 70 3 0.005928854
## 75 3 0.005928854
## 80 15 0.029644269
## 82.5 2 0.003952569
## 85 2 0.003952569
## 90 5 0.009881423
## 95 4 0.007905138
## 100 1 0.001976285
tabyl(boston_data, rad, zn)
## rad 0 12.5 17.5 18 20 21 22 25 28 30 33 34 35 40 45 52.5 55 60 70 75 80 82.5
## 1 6 0 0 1 0 0 0 0 0 0 0 0 3 2 0 0 1 2 0 0 3 0
## 2 18 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 2
## 3 26 0 1 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0
## 4 77 3 0 0 0 4 0 4 3 0 0 0 0 5 0 0 0 2 0 0 9 0
## 5 78 7 0 0 16 0 0 0 0 0 0 0 0 0 6 0 2 0 3 0 0 0
## 6 17 0 0 0 0 0 0 0 0 6 0 0 0 0 0 3 0 0 0 0 0 0
## 7 0 0 0 0 0 0 10 0 0 0 4 3 0 0 0 0 0 0 0 0 0 0
## 8 18 0 0 0 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 24 132 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 85 90 95 100
## 0 2 0 0
## 1 0 0 0
## 0 1 2 0
## 1 0 2 0
## 0 2 0 1
## 0 0 0 0
## 0 0 0 0
## 0 0 0 0
## 0 0 0 0
boston_data |>
group_by(rm) |>
summarise(MeanValue = mean(medv, na.rm = TRUE), SDValue = sd(medv, na.rm = TRUE))
## # A tibble: 446 × 3
## rm MeanValue SDValue
## <dbl> <dbl> <dbl>
## 1 3.56 27.5 NA
## 2 3.86 23.1 NA
## 3 4.14 12.8 1.34
## 4 4.37 8.8 NA
## 5 4.52 7 NA
## 6 4.63 17.9 NA
## 7 4.65 10.5 NA
## 8 4.88 10.2 NA
## 9 4.90 11.8 NA
## 10 4.91 13.8 NA
## # ℹ 436 more rows
boston_data |>
select(rm, medv) |>
get_summary_stats(type = "full")
## # A tibble: 2 × 13
## variable n min max median q1 q3 iqr mad mean sd se
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 rm 506 3.56 8.78 6.21 5.89 6.62 0.738 0.512 6.28 0.703 0.031
## 2 medv 506 5 50 21.2 17.0 25 7.98 5.93 22.5 9.20 0.409
## # ℹ 1 more variable: ci <dbl>