# Attach psych, then print its per-column descriptive statistics for df.
library(psych)
psych::describe(df)
## vars n mean sd median trimmed mad min max range
## PassengerId 1 332 1100.06 122.76 1099.5 1100.41 161.60 892.00 1307.00 415.00
## Pclass 2 332 2.14 0.85 2.0 2.18 1.48 1.00 3.00 2.00
## Name* 3 332 166.50 95.98 166.5 166.50 123.06 1.00 332.00 331.00
## Sex* 4 332 1.62 0.49 2.0 1.65 0.00 1.00 2.00 1.00
## Age 5 332 30.27 14.18 27.0 29.64 11.86 0.17 76.00 75.83
## SibSp 6 332 0.48 0.87 0.0 0.32 0.00 0.00 8.00 8.00
## Parch 7 332 0.40 0.81 0.0 0.20 0.00 0.00 6.00 6.00
## Ticket* 8 332 141.68 84.33 142.5 141.53 112.68 1.00 285.00 284.00
## Fare 9 331 40.98 61.23 16.0 25.63 12.41 0.00 512.33 512.33
## Cabin* 10 332 10.41 18.93 1.0 5.59 0.00 1.00 73.00 72.00
## Embarked* 11 332 2.44 0.86 3.0 2.55 0.00 1.00 3.00 2.00
## skew kurtosis se
## PassengerId -0.01 -1.25 6.74
## Pclass -0.28 -1.55 0.05
## Name* 0.00 -1.21 5.27
## Sex* -0.48 -1.77 0.03
## Age 0.45 0.05 0.78
## SibSp 3.53 20.02 0.05
## Parch 2.76 10.56 0.04
## Ticket* 0.00 -1.29 4.63
## Fare 3.26 13.84 3.37
## Cabin* 1.92 2.37 1.04
## Embarked* -0.97 -0.95 0.05
# One-time setup (left disabled): install.packages("visdat")
library(visdat)
# Alternative plot, left disabled: vis_miss(df)
# Draw the visdat overview plot of df.
visdat::vis_dat(df)
# One-time setup (left disabled): install.packages("stargazer")
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Default plain-text summary table of df (no custom labels or notes yet).
stargazer::stargazer(df, type = "text")
##
## ================================================
## Statistic N Mean St. Dev. Min Max
## ------------------------------------------------
## PassengerId 332 1,100.063 122.763 892 1,307
## Pclass 332 2.145 0.846 1 3
## Age 332 30.273 14.181 0.170 76.000
## SibSp 332 0.482 0.874 0 8
## Parch 332 0.398 0.811 0 6
## Fare 331 40.982 61.229 0.000 512.329
## ------------------------------------------------
# Human-readable row labels for the summary table, listed in the same order as
# the rows stargazer printed above (PassengerId, Pclass, Age, SibSp, Parch, Fare).
variable_labels <- c(
  "Passenger Id",
  "Passenger Class",
  "Age",
  "# of Siblings/Spouses",
  "# of Children/Parents",
  "Fare"
)

# Footnotes printed under the table.
# 332 rows + 86 rows with missing Age = 418 original responses, so 86 (not 418)
# responses were omitted; the first note previously reported the total instead.
data_notes <- c(
  "N = 332, because 86 of 418 responses have missing Age values.",
  "Age has 86 missing values.",
  "Fare has 1 missing value."
)

# Sanity checks: covariate.labels must be a character vector with one entry per
# reported variable (six here).
class(variable_labels)
## [1] "character"
length(variable_labels)
## [1] 6

# Labelled summary table: the N column is dropped (its content is carried by the
# notes instead) and values are rounded to two decimals.
stargazer(
  df,
  type = "text",
  title = "Summary Statistics",
  covariate.labels = variable_labels,
  notes = data_notes,
  omit.summary.stat = "n",
  digits = 2
)
##
## Summary Statistics
## =================================================================
## Statistic Mean St. Dev. Min Max
## -----------------------------------------------------------------
## Passenger Id 1,100.06 122.76 892 1,307
## Passenger Class 2.14 0.85 1 3
## Age 30.27 14.18 0.17 76.00
## # of Siblings/Spouses 0.48 0.87 0 8
## # of Children/Parents 0.40 0.81 0 6
## Fare 40.98 61.23 0.00 512.33
## -----------------------------------------------------------------
## N = 332, because 86 of 418 responses have missing Age values.
## Age has 86 missing values.
## Fare has 1 missing value.
The mean age was about 30, while the minimum was 0.17 and the maximum was 76. Because the mean lies much closer to the minimum than to the maximum (and is above the median of 27), the distribution of passengers' ages is right-skewed.
Similarly, a large number of passenger fares must have been relatively low: the mean fare (40.98) is far closer to the minimum (0.00) than to the maximum (512.33), so the distribution of fares was most likely right-skewed as well.
# Pull the Age column out of df; the univariate summaries below all use it.
data <- df$Age

# Measures of central tendency.
median(data)
## [1] 27
# Statistical mode (most frequent age). base::mode() is NOT this: it reports the
# storage mode of the vector ("numeric"). Tabulate the values and take the most
# frequent one instead; which.max() returns the first (smallest) value on ties.
# (Output not recorded here -- re-run to see the value.)
as.numeric(names(which.max(table(data))))
mean(data)
## [1] 30.27259

# Histogram of the ages to show the shape of the distribution.
hist(data)
# Measures of spread for the ages.
?sd
sd(x = data)
## [1] 14.18121
var(x = data)
## [1] 201.1067
# The variance is the squared standard deviation, but sd() is computed as
# sqrt(var()), so squaring it again differs from var() in the last few bits.
# An exact `==` on doubles therefore reports FALSE; compare with a numerical
# tolerance instead.
isTRUE(all.equal(var(data), sd(x = data)^2))
## [1] TRUE

# Extremes and the width of the observed range.
max(data)
## [1] 76
min(data)
## [1] 0.17
range(data)
## [1] 0.17 76.00
max(data) - min(data)
## [1] 75.83

# Standard deviation rounded to two decimals for reporting.
round(x = sd(x = data), digits = 2)
## [1] 14.18
# Boxplots of the ages, plus the five-number summary behind them.
?boxplot
# Spell out TRUE: T is an ordinary variable that can be reassigned.
boxplot(data, horizontal = TRUE)

# Quartiles are documented under quantile(); there is no 'quartile' help topic.
?quantile
summary(data)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.17 21.00 27.00 30.27 39.00 76.00
IQR(data)
## [1] 18

# range = 3 lets the whiskers reach up to 3 * IQR from the box (default is 1.5),
# so fewer points are drawn as outliers.
boxplot(data, horizontal = TRUE, range = 3)

# Upright boxplot. boxplot() has no `vertical` argument; orientation is
# controlled by `horizontal`, so request the upright layout with FALSE.
boxplot(
  data,
  horizontal = FALSE,
  col = "blue",
  range = 3
)

# Final styled version: horizontal, default whisker length, labelled axis.
boxplot(
  data,
  horizontal = TRUE,
  col = "pink",
  range = 1.5,
  xlab = "Passenger Ages",
  border = "orange"
)