Read this help file: Help

My first pic:

1 Setup:

# Make the project folder the working directory so that relative file names
# (such as the CSV passed to read.csv()) are resolved inside it.
# NOTE(review): this absolute path is specific to one machine; adjust it
# before running the script anywhere else.
setwd("C:/Users/Alex Law/Documents/BC Data Analysis")

Install and load all libraries: make sure the packages (psych, tidyverse, visdat and stargazer) are installed before you load them.

#install.packages("tidyverse")
#install.packages("visdat")
#install.packages("stargazer")

library("psych")
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("visdat") #for visualizing the data
library("stargazer")
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

1.1 Set my working directory

2 import Data:

Now I will import my data. Make sure you comment out any lines you do not want to re-run (for example, the install.packages() calls).

# Load the passenger data from the CSV file in the working directory, then
# preview the first six rows to confirm the import worked.
df <- read.csv(file = "train (1).csv")
head(df, n = 6L)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp Parch
## 1                             Braund, Mr. Owen Harris   male  22     1     0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                              Heikkinen, Miss. Laina female  26     0     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5                            Allen, Mr. William Henry   male  35     0     0
## 6                                    Moran, Mr. James   male  NA     0     0
##             Ticket    Fare Cabin Embarked
## 1        A/5 21171  7.2500              S
## 2         PC 17599 71.2833   C85        C
## 3 STON/O2. 3101282  7.9250              S
## 4           113803 53.1000  C123        S
## 5           373450  8.0500              S
## 6           330877  8.4583              Q
# Preview the last six rows as well, to check the end of the data.
tail(df, n = 6L)
##     PassengerId Survived Pclass                                     Name    Sex
## 886         886        0      3     Rice, Mrs. William (Margaret Norton) female
## 887         887        0      2                    Montvila, Rev. Juozas   male
## 888         888        1      1             Graham, Miss. Margaret Edith female
## 889         889        0      3 Johnston, Miss. Catherine Helen "Carrie" female
## 890         890        1      1                    Behr, Mr. Karl Howell   male
## 891         891        0      3                      Dooley, Mr. Patrick   male
##     Age SibSp Parch     Ticket   Fare Cabin Embarked
## 886  39     0     5     382652 29.125              Q
## 887  27     0     0     211536 13.000              S
## 888  19     0     0     112053 30.000   B42        S
## 889  NA     1     2 W./C. 6607 23.450              S
## 890  26     0     0     111369 30.000  C148        C
## 891  32     0     0     370376  7.750              Q
# Compact overview of the data frame: its dimensions and each column's type.
str(object = df)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...

3 Summary statistics:

Here is my summary-statistics table from stargazer. Two formatting tips:

  1. Split the arguments into different lines within a function

  2. Align the equals signs (=) of the arguments

# Complete-case copy of the data (rows containing any NA removed). The table
# below summarises the full data set `df`, not this copy.
df_drop <- na.omit(df)

# embellished command
# The note is built from the data, so the reported counts always match `df`.
stargazer(df,
          type         = "text",    # output format: plain text
          notes        = sprintf("N=%d, but age has %d missing values",
                                 nrow(df), sum(is.na(df$Age))),
          summary.stat = c("mean", "sd", "min", "max"),
          title        = "Titanic Stats",
          digits       = 1          # controls decimal places
          )
## 
## Titanic Stats
## =========================================
## Statistic      Mean   St. Dev. Min   Max 
## -----------------------------------------
## PassengerId    446.0   257.4    1    891 
## Survived        0.4     0.5     0     1  
## Pclass          2.3     0.8     1     3  
## Age            29.7     14.5   0.4  80.0 
## SibSp           0.5     1.1     0     8  
## Parch           0.4     0.8     0     6  
## Fare           32.2     49.7   0.0  512.3
## -----------------------------------------
## N=891, but age has 177 missing values

4 Visualization

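Missing Age values make up about 1.7% of all cells in the dataset (177 of the 891 Age values, i.e. about 19.9% of that column, are missing).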

# Plot the data frame cell by cell, coloured by column class, with missing
# values shown as their own category.
visdat::vis_dat(df)

Read up on indexing:

Variable Type Level of Measurement
Passenger ID Qualitative Nominal
Survived Qualitative Nominal
Passenger Class Qualitative Ordinal
Name Qualitative Nominal
Sex Qualitative Nominal
Age Quantitative Ratio
Number of siblings/ spouses aboard Quantitative Ratio
Number of parents/ children aboard Quantitative Ratio
Ticket Qualitative Nominal
Fare Quantitative Ratio
Cabin Qualitative Nominal
Embarked Qualitative Nominal

5 Treating Missing Data

I can drop all observations that have a missing Age value.

# Drop missing values: keep only the rows that have no NA in any column.
df_drop <- na.omit(df)

# Replace missing values of Age with the mean (mean imputation)
######## STEP 1: FIND THE MEAN (na.rm = TRUE skips the missing ages)
mean(df$Age, na.rm = TRUE)
## [1] 29.69912
######## STEP 2: DUPLICATE THE ORIGINAL DATA (so df itself is left unchanged)
df_imputed <- df 

####### STEP 3: CHANGE THE VARIABLE VALUES
# NOTE(review): the imputation itself is still commented out, so df_imputed
# is currently an exact copy of df; uncomment the lines below to apply it.
#describe(df_imputed$Age) #mean before imputing
#df_imputed$Age[is.na(df$Age)] <- mean(df$Age, na.rm = TRUE)
#describe(df_imputed$Age) #mean after imputing

6 Homework:

# Read the raw CSV file into `test`.
test <- read.csv(file = "train (1).csv")

# Keep only the rows without any NA.
df_clean <- na.omit(object = test)

# Visual check of the cleaned data, then the default stargazer summary table.
vis_dat(df_clean)

stargazer(df_clean, type = "text")
## 
## ==============================================
## Statistic    N   Mean   St. Dev.  Min    Max  
## ----------------------------------------------
## PassengerId 714 448.583 259.120    1     891  
## Survived    714  0.406   0.491     0      1   
## Pclass      714  2.237   0.838     1      3   
## Age         714 29.699   14.526  0.420 80.000 
## SibSp       714  0.513   0.930     0      5   
## Parch       714  0.431   0.853     0      6   
## Fare        714 34.695   52.919  0.000 512.329
## ----------------------------------------------
# Horizontal box plot of passenger age, using the complete-case data.
boxplot(df_clean$Age,
        horizontal = TRUE,
        ylim       = c(0, 100),              # range of the age axis
        xaxt       = "n",                    # do not draw the x axis
        col        = rgb(0.9, 0.5, 0, 0.3),  # semi-transparent orange
        frame      = FALSE                   # FALSE spelled out: F can be reassigned
        )

# Histogram of passenger age, using the complete-case data.
hist(df_clean$Age,
     breaks = 10,                         # suggested number of bins
     col    = rgb(0.2, 0.8, 0.5, 0.5),    # semi-transparent green
     border = FALSE,                      # FALSE spelled out: F can be reassigned
     main   = "",                         # no title
     xlab   = "Age",
     xlim   = c(0, 100)
     )