Day 3 Markdown and Basic Graph

Cleaning the Data

# The workspace is deliberately not cleared here: `remove(list = ls())` would
# delete a reader's own objects if this chunk were run interactively, and a
# document knitted in a fresh R session starts with an empty workspace anyway.

# One-time setup; keep commented out so knitting never reinstalls the package.
# install.packages("visdat")

library(visdat)

# Read the raw passenger data, then work on a copy named `df` so that `test`
# keeps the data exactly as it was read.
# NOTE(review): this absolute path only resolves on the author's machine --
# consider a path relative to the project directory.
test <- read.csv("~/Desktop/BCE/R FILES/test.csv")
df <- test

# Preview the first six rows.
head(df)
##   PassengerId Pclass                                         Name    Sex  Age
## 1         892      3                             Kelly, Mr. James   male 34.5
## 2         893      3             Wilkes, Mrs. James (Ellen Needs) female 47.0
## 3         894      2                    Myles, Mr. Thomas Francis   male 62.0
## 4         895      3                             Wirz, Mr. Albert   male 27.0
## 5         896      3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0
## 6         897      3                   Svensson, Mr. Johan Cervin   male 14.0
##   SibSp Parch  Ticket    Fare Cabin Embarked
## 1     0     0  330911  7.8292              Q
## 2     1     0  363272  7.0000              S
## 3     0     0  240276  9.6875              Q
## 4     0     0  315154  8.6625              S
## 5     1     1 3101298 12.2875              S
## 6     0     0    7538  9.2250              S
# Plot an overview of the data frame's columns and missing values.
# (Alternative view focused on missingness only: visdat::vis_miss(df))
visdat::vis_dat(df)

library(psych)

# Show the first six rows again (6 is head()'s default).
head(df, n = 6L)
##   PassengerId Pclass                                         Name    Sex  Age
## 1         892      3                             Kelly, Mr. James   male 34.5
## 2         893      3             Wilkes, Mrs. James (Ellen Needs) female 47.0
## 3         894      2                    Myles, Mr. Thomas Francis   male 62.0
## 4         895      3                             Wirz, Mr. Albert   male 27.0
## 5         896      3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0
## 6         897      3                   Svensson, Mr. Johan Cervin   male 14.0
##   SibSp Parch  Ticket    Fare Cabin Embarked
## 1     0     0  330911  7.8292              Q
## 2     1     0  363272  7.0000              S
## 3     0     0  240276  9.6875              Q
## 4     0     0  315154  8.6625              S
## 5     1     1 3101298 12.2875              S
## 6     0     0    7538  9.2250              S
# Count the NAs in each column: Age has 86 missing values and Fare has 1.
colSums(is.na(df))
## PassengerId      Pclass        Name         Sex         Age       SibSp 
##           0           0           0           0          86           0 
##       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           1           0           0
# Drop every row that contains an NA in any column.
# NOTE(review): the summary tables and plots that follow still use `df`, not
# `df_clean` -- confirm that is intended.
df_clean <- stats::na.omit(df)

# Preview the first six complete rows (6 is head()'s default).
head(df_clean, n = 6L)
##   PassengerId Pclass                                         Name    Sex  Age
## 1         892      3                             Kelly, Mr. James   male 34.5
## 2         893      3             Wilkes, Mrs. James (Ellen Needs) female 47.0
## 3         894      2                    Myles, Mr. Thomas Francis   male 62.0
## 4         895      3                             Wirz, Mr. Albert   male 27.0
## 5         896      3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0
## 6         897      3                   Svensson, Mr. Johan Cervin   male 14.0
##   SibSp Parch  Ticket    Fare Cabin Embarked
## 1     0     0  330911  7.8292              Q
## 2     1     0  363272  7.0000              S
## 3     0     0  240276  9.6875              Q
## 4     0     0  315154  8.6625              S
## 5     1     1 3101298 12.2875              S
## 6     0     0    7538  9.2250              S
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Summary-statistics table in stargazer's default output format (LaTeX source).
stargazer(df, type = "latex")
## 
## % Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
## % Date and time: Fri, Jul 26, 2024 - 08:32:40
## \begin{table}[!htbp] \centering 
##   \caption{} 
##   \label{} 
## \begin{tabular}{@{\extracolsep{5pt}}lccccc} 
## \\[-1.8ex]\hline 
## \hline \\[-1.8ex] 
## Statistic & \multicolumn{1}{c}{N} & \multicolumn{1}{c}{Mean} & \multicolumn{1}{c}{St. Dev.} & \multicolumn{1}{c}{Min} & \multicolumn{1}{c}{Max} \\ 
## \hline \\[-1.8ex] 
## PassengerId & 418 & 1,100.500 & 120.810 & 892 & 1,309 \\ 
## Pclass & 418 & 2.266 & 0.842 & 1 & 3 \\ 
## Age & 332 & 30.273 & 14.181 & 0.170 & 76.000 \\ 
## SibSp & 418 & 0.447 & 0.897 & 0 & 8 \\ 
## Parch & 418 & 0.392 & 0.981 & 0 & 9 \\ 
## Fare & 417 & 35.627 & 55.908 & 0.000 & 512.329 \\ 
## \hline \\[-1.8ex] 
## \end{tabular} 
## \end{table}
# The same summary statistics as a titled plain-text table.
stargazer(
  df,
  title = "Summary Statistics",
  type  = "text"
)
## 
## Summary Statistics
## ================================================
## Statistic    N    Mean    St. Dev.  Min    Max  
## ------------------------------------------------
## PassengerId 418 1,100.500 120.810   892   1,309 
## Pclass      418   2.266    0.842     1      3   
## Age         332  30.273    14.181  0.170 76.000 
## SibSp       418   0.447    0.897     0      8   
## Parch       418   0.392    0.981     0      9   
## Fare        417  35.627    55.908  0.000 512.329
## ------------------------------------------------
# Human-readable row labels, in the order the numeric columns are reported:
# PassengerId, Pclass, Age, SibSp, Parch, Fare.
summary_labels <- c(
  "Passenger Id", "Passenger Class", "Age",
  "# of Siblings", "# of Children or Parents", "Fare"
)
stargazer(
  df,
  type             = "text",
  title            = "Summary Statistics",
  covariate.labels = summary_labels
)
## 
## Summary Statistics
## =============================================================
## Statistic                 N    Mean    St. Dev.  Min    Max  
## -------------------------------------------------------------
## Passenger Id             418 1,100.500 120.810   892   1,309 
## Passenger Class          418   2.266    0.842     1      3   
## Age                      332  30.273    14.181  0.170 76.000 
## # of Siblings            418   0.447    0.897     0      8   
## # of Children or Parents 418   0.392    0.981     0      9   
## Fare                     417  35.627    55.908  0.000 512.329
## -------------------------------------------------------------
# Final summary table: text output, two decimals, and the N column replaced by
# table notes. The notes are computed from `df` rather than typed in by hand,
# so they cannot drift out of sync with the data (for this file they render as
# "N = 418", "Age has 86 missing values" and "Fare has 1 missing value").

# Build one "<label> has <k> missing value(s)" note for the vector `x`.
missing_note <- function(label, x) {
  n_missing <- sum(is.na(x))
  sprintf(
    "%s has %d missing %s",
    label, n_missing, if (n_missing == 1L) "value" else "values"
  )
}

stargazer(df, type          = "text",
          title             = "Summary Statistics",
          covariate.labels  = c("Passenger Id", "Passenger Class", "Age",
                                "# of Siblings", "# of Children or Parents",
                                "Fare"),
          notes             = c(paste("N =", nrow(df)),
                                missing_note("Age", df$Age),
                                missing_note("Fare", df$Fare)),
          omit.summary.stat = "N",
          digits            = 2)
## 
## Summary Statistics
## ======================================================
## Statistic                  Mean   St. Dev. Min   Max  
## ------------------------------------------------------
## Passenger Id             1,100.50  120.81  892  1,309 
## Passenger Class            2.27     0.84    1     3   
## Age                       30.27    14.18   0.17 76.00 
## # of Siblings              0.45     0.90    0     8   
## # of Children or Parents   0.39     0.98    0     9   
## Fare                      35.63    55.91   0.00 512.33
## ------------------------------------------------------
## N = 418                                               
## Age has 86 missing values                             
## Fare has 1 missing value

Describe at least 1 key observation and at most 3 key observations as text in your final output file (not as comments within your R chunk).

One key observation is that both the number of siblings (SibSp) and the number of children or parents (Parch) are right-skewed: their means (0.45 and 0.39) lie much closer to the minimum of 0 than to the maximums of 8 and 9.

Creating Box Plot

# ?boxplot  # interactive help lookup; commented out so that knitting the
#           # document does not try to open a help page

# Passenger ages, including the NA entries.
data <- df$Age

# Default box plot: whiskers reach at most 1.5 * IQR beyond the box.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
boxplot(data, horizontal = TRUE)

head(data)
## [1] 34.5 47.0 62.0 27.0 22.0 14.0
summary(data)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.17   21.00   27.00   30.27   39.00   76.00      86
# range = 3 lets the whiskers reach up to 3 * IQR beyond the box, so fewer
# points are drawn as outliers.
boxplot(data, horizontal = TRUE, range = 3)

Creating Histogram

# Passenger ages. 86 of them are NA, so each summary below has to drop the NAs
# explicitly; otherwise mean() and median() simply return NA.
data <- df$Age

round(mean(data, na.rm = TRUE), 2)
## [1] 30.27
median(data, na.rm = TRUE)
## [1] 27
# base::mode() reports the storage mode ("numeric"), not the most frequent
# value, so the statistical mode is taken from a frequency table instead.
# table() drops NAs by default; if several ages tie, all of them are returned.
age_counts <- table(data)
as.numeric(names(age_counts)[age_counts == max(age_counts)])
# (re-knit the document to print the modal age(s) here)
hist(data)

The box plot and histogram show that Age is right-skewed: most of the observations lie on the left side of the distribution, with a long tail toward older ages, and the mean (30.27) is greater than the median (27.00).