Using `vis_miss()` from the visdat package, we can see which values in the dataset are missing.
# Read the Titanic training set; `train` keeps the raw import and `df` is the
# working copy used by the statements that follow.
train <- read.csv(file = "C:/Users/Adi/Desktop/temp for school/train.csv")
df <- train
library("psych")
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("visdat")
library("stargazer")
##
## Please cite as:
##
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Print a compact overview of every column: name, type and first values.
# `glimpse()` has no `type` argument; the former `type = text` passed the
# unquoted graphics function `text` into `...`, where it was silently ignored,
# so it is dropped here.
glimpse(df)
## Rows: 891
## Columns: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ Survived <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1…
## $ Pclass <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3…
## $ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl…
## $ Sex <chr> "male", "female", "female", "female", "male", "male", "mal…
## $ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, …
## $ SibSp <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0…
## $ Parch <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0…
## $ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37…
## $ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,…
## $ Cabin <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6", "C…
## $ Embarked <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"…
# Plot which cells are missing, then an overview of the column types.
vis_miss(df)
vis_dat(df)

# Drop every row that contains an NA. Both `df_clean` and `x` hold the
# resulting complete-case data frame.
df_clean <- na.omit(df)
x <- df_clean
The two numbers computed outside the summary table, `x2` and `y2`, are the mean and median of Age (ignoring missing values).
# Mean and median passenger age over the full data set, skipping missing ages.
x2 <- mean(df[["Age"]], na.rm = TRUE)
y2 <- median(df[["Age"]], na.rm = TRUE)

# Plain-text summary table for the complete-case data (`df_clean`); the
# observation-count column is left out of the table.
# NOTE(review): the note text quotes counts for the raw data, while the table
# itself is computed on `df_clean` -- confirm this is the intended wording.
stargazer(
  df_clean,
  type = "text",
  title = "Corrected Titanic Survivor Statistics",
  notes = "Age is missing 177 observations while every other variable has 891 observations",
  omit.summary.stat = "n"
)
##
## Corrected Titanic Survivor Statistics
## ===================================================================================
## Statistic Mean St. Dev. Min Max
## -----------------------------------------------------------------------------------
## PassengerId 448.583 259.120 1 891
## Survived 0.406 0.491 0 1
## Pclass 2.237 0.838 1 3
## Age 29.699 14.526 0.420 80.000
## SibSp 0.513 0.930 0 5
## Parch 0.431 0.853 0 6
## Fare 34.695 52.919 0.000 512.329
## -----------------------------------------------------------------------------------
## Age is missing 177 observations while every other variable has 891 observations
# Interactive lookup of the boxplot help page.
help("boxplot")
## starting httpd help server ... done
# Horizontal boxplot of passenger age from the complete-case data.
# The previous version assigned `rnorm(200)` to `df_clean`, which replaced the
# cleaned Titanic data frame with random noise and plotted that noise instead
# of the data. `df_clean` is now left intact and its Age column is plotted.
# NOTE(review): Age is assumed to be the intended variable, since the summary
# statistics and histogram around this plot are all about Age -- confirm.
# The seed is retained so that any later random draws stay reproducible.
set.seed(7)
boxplot(df_clean$Age, horizontal = TRUE)
# Set the plot margins (bottom, left, top, right, in lines of text) for the
# plots drawn after this call.
par(mar = c(4, 3.1, 1.1, 2.1))
# Interactive lookup of the histogram help page.
?hist
# Histogram of passenger age from the complete-case data (`x`), drawn with a
# semi-transparent green fill, no main title, and an x-axis fixed to 0-100.
# `border = TRUE` replaces the reassignable alias `T`.
hist(
  x$Age,
  breaks = 10,
  col = rgb(0.2, 0.8, 0.5, 0.5),
  border = TRUE,
  main = "",
  xlab = "Age",
  xlim = c(0, 100)
)
# Interactive lookup of the help topic "col".
# NOTE(review): in base R this topic documents matrix column indexes, not plot
# colours -- confirm this is the page that was wanted.
help("col")