# Run this script in a fresh R session instead of clearing the workspace with
# rm(list = ls()): that call deletes every object the caller has in the global
# environment, yet does not reset loaded packages or options.
#install.packages("visdat")
library(visdat)

# Read the raw CSV. `test` keeps the untouched import; `df` is the working copy
# used by the rest of the script.
test <- read.csv("~/Desktop/BCE/R FILES/test.csv")
df <- test

# Quick look at the first rows to confirm the file was parsed as expected.
head(df)
## PassengerId Pclass Name Sex Age
## 1 892 3 Kelly, Mr. James male 34.5
## 2 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0
## 3 894 2 Myles, Mr. Thomas Francis male 62.0
## 4 895 3 Wirz, Mr. Albert male 27.0
## 5 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0
## 6 897 3 Svensson, Mr. Johan Cervin male 14.0
## SibSp Parch Ticket Fare Cabin Embarked
## 1 0 0 330911 7.8292 Q
## 2 1 0 363272 7.0000 S
## 3 0 0 240276 9.6875 Q
## 4 0 0 315154 8.6625 S
## 5 1 1 3101298 12.2875 S
## 6 0 0 7538 9.2250 S
# Visual overview of df drawn by visdat.
# Commented-out alternative from the same package: vis_miss(df)
visdat::vis_dat(df)
library(psych)
# Re-print the first rows of the working data frame.
utils::head(df)
## PassengerId Pclass Name Sex Age
## 1 892 3 Kelly, Mr. James male 34.5
## 2 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0
## 3 894 2 Myles, Mr. Thomas Francis male 62.0
## 4 895 3 Wirz, Mr. Albert male 27.0
## 5 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0
## 6 897 3 Svensson, Mr. Johan Cervin male 14.0
## SibSp Parch Ticket Fare Cabin Embarked
## 1 0 0 330911 7.8292 Q
## 2 1 0 363272 7.0000 S
## 3 0 0 240276 9.6875 Q
## 4 0 0 315154 8.6625 S
## 5 1 1 3101298 12.2875 S
## 6 0 0 7538 9.2250 S
# Count the NA cells in each column: Age has 86 missing values and Fare has 1.
colSums(x = is.na(df))
## PassengerId Pclass Name Sex Age SibSp
## 0 0 0 0 86 0
## Parch Ticket Fare Cabin Embarked
## 0 0 1 0 0
# Keep only the rows that have no NA in any column, then preview the result.
df_clean <- stats::na.omit(object = df)
utils::head(x = df_clean)
## PassengerId Pclass Name Sex Age
## 1 892 3 Kelly, Mr. James male 34.5
## 2 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0
## 3 894 2 Myles, Mr. Thomas Francis male 62.0
## 4 895 3 Wirz, Mr. Albert male 27.0
## 5 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0
## 6 897 3 Svensson, Mr. Johan Cervin male 14.0
## SibSp Parch Ticket Fare Cabin Embarked
## 1 0 0 330911 7.8292 Q
## 2 1 0 363272 7.0000 S
## 3 0 0 240276 9.6875 Q
## 4 0 0 315154 8.6625 S
## 5 1 1 3101298 12.2875 S
## 6 0 0 7538 9.2250 S
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Default call: prints the summary-statistics table for df as LaTeX source.
stargazer::stargazer(df)
##
## % Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
## % Date and time: Fri, Jul 26, 2024 - 08:32:40
## \begin{table}[!htbp] \centering
## \caption{}
## \label{}
## \begin{tabular}{@{\extracolsep{5pt}}lccccc}
## \\[-1.8ex]\hline
## \hline \\[-1.8ex]
## Statistic & \multicolumn{1}{c}{N} & \multicolumn{1}{c}{Mean} & \multicolumn{1}{c}{St. Dev.} & \multicolumn{1}{c}{Min} & \multicolumn{1}{c}{Max} \\
## \hline \\[-1.8ex]
## PassengerId & 418 & 1,100.500 & 120.810 & 892 & 1,309 \\
## Pclass & 418 & 2.266 & 0.842 & 1 & 3 \\
## Age & 332 & 30.273 & 14.181 & 0.170 & 76.000 \\
## SibSp & 418 & 0.447 & 0.897 & 0 & 8 \\
## Parch & 418 & 0.392 & 0.981 & 0 & 9 \\
## Fare & 417 & 35.627 & 55.908 & 0.000 & 512.329 \\
## \hline \\[-1.8ex]
## \end{tabular}
## \end{table}
# Same summary table, rendered as plain text and given a title.
stargazer(
  df,
  type = "text",
  title = "Summary Statistics"
)
##
## Summary Statistics
## ================================================
## Statistic N Mean St. Dev. Min Max
## ------------------------------------------------
## PassengerId 418 1,100.500 120.810 892 1,309
## Pclass 418 2.266 0.842 1 3
## Age 332 30.273 14.181 0.170 76.000
## SibSp 418 0.447 0.897 0 8
## Parch 418 0.392 0.981 0 9
## Fare 417 35.627 55.908 0.000 512.329
## ------------------------------------------------
# Plain-text summary table with readable row labels. The labels are listed in
# the same order as the rows of the unlabelled table (PassengerId, Pclass, Age,
# SibSp, Parch, Fare).
stargazer(
  df,
  type = "text",
  title = "Summary Statistics",
  covariate.labels = c(
    "Passenger Id",
    "Passenger Class",
    "Age",
    "# of Siblings",
    "# of Children or Parents",
    "Fare"
  )
)
##
## Summary Statistics
## =============================================================
## Statistic N Mean St. Dev. Min Max
## -------------------------------------------------------------
## Passenger Id 418 1,100.500 120.810 892 1,309
## Passenger Class 418 2.266 0.842 1 3
## Age 332 30.273 14.181 0.170 76.000
## # of Siblings 418 0.447 0.897 0 8
## # of Children or Parents 418 0.392 0.981 0 9
## Fare 417 35.627 55.908 0.000 512.329
## -------------------------------------------------------------
# Final summary table: labelled rows, two decimals, and the N column replaced
# by table notes. The notes are computed from df rather than typed in by hand,
# so the row count and missing-value counts cannot drift out of sync with the
# data. ngettext() picks "value" vs "values" to match each count.
stargazer(
  df,
  type = "text",
  title = "Summary Statistics",
  covariate.labels = c(
    "Passenger Id",
    "Passenger Class",
    "Age",
    "# of Siblings",
    "# of Children or Parents",
    "Fare"
  ),
  notes = c(
    sprintf("N = %d", nrow(df)),
    sprintf(
      "Age has %d missing %s",
      sum(is.na(df$Age)),
      ngettext(sum(is.na(df$Age)), "value", "values")
    ),
    sprintf(
      "Fare has %d missing %s",
      sum(is.na(df$Fare)),
      ngettext(sum(is.na(df$Fare)), "value", "values")
    )
  ),
  omit.summary.stat = "N",
  digits = 2
)
##
## Summary Statistics
## ======================================================
## Statistic Mean St. Dev. Min Max
## ------------------------------------------------------
## Passenger Id 1,100.50 120.81 892 1,309
## Passenger Class 2.27 0.84 1 3
## Age 30.27 14.18 0.17 76.00
## # of Siblings 0.45 0.90 0 8
## # of Children or Parents 0.39 0.98 0 9
## Fare 35.63 55.91 0.00 512.33
## ------------------------------------------------------
## N = 418
## Age has 86 missing values
## Fare has 1 missing value
One key observation I had was that both the number of siblings and the number of children or parents are skewed right: in each case the mean (0.45 and 0.39, respectively) lies much closer to the minimum (0) than to the maximum (8 and 9).
# Help page for reference only. Kept as a comment because `?boxplot` opens the
# interactive help viewer, which is not wanted when the script is run
# non-interactively.
# ?boxplot

# Passenger ages; this vector contains NAs (see the summary below).
data <- df$Age

# Horizontal boxplot with the default whisker range. TRUE is spelled out
# because T is an ordinary variable that can be reassigned.
boxplot(data, horizontal = TRUE)
head(data)
## [1] 34.5 47.0 62.0 27.0 22.0 14.0
summary(data)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.17 21.00 27.00 30.27 39.00 76.00 86

# Same plot with a wider whisker range (range = 3), so fewer points are drawn
# as individual outliers.
boxplot(data, horizontal = TRUE, range = 3)
data <- df$Age

# Age contains NAs, so mean() and median() need na.rm = TRUE; without it both
# calls return NA.
mean(data, na.rm = TRUE)    # about 30.27, the Mean reported by summary(data)
median(data, na.rm = TRUE)  # 27, the Median reported by summary(data)

# Statistical mode (the most frequent age). Base R's mode() returns the storage
# mode of the object ("numeric"), not the most common value, so the mode is
# taken from a frequency table instead. table() leaves NAs out by default, and
# every tied value is returned if several ages share the top count.
age_counts <- table(data)
as.numeric(names(age_counts)[age_counts == max(age_counts)])

# hist() drops the NAs on its own, so no extra handling is needed here.
hist(data)
These charts show that the age data are skewed right: the majority of the observations are located on the left side of the graph, and the mean (30.27) is greater than the median (27.00).