HW_Day3

# Load the Titanic test set into `df` (the rendered output below reports
# 418 rows and 11 columns).
# NOTE(review): this absolute path only resolves on the original author's
# machine; consider a project-relative path so the script is reproducible.
df <- read.csv("C:/Users/HP/Downloads/test2.csv")

## Data has been imported correctly -
# head(df) # first 6 rows of the data (default n = 6)
# tail(df) # last 6 rows of the data (default n = 6)
# str(df)  # compact structure, one line per column

# Column-wise preview of the data. Namespace-qualified so the call works even
# when the tidyverse has not been attached with library().
dplyr::glimpse(df)

## Rows: 418
## Columns: 11
## $ PassengerId <int> 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903…
## $ Pclass      <int> 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 1, 1, 2, 1, 2, 2, 3, 3, 3…
## $ Name        <chr> "Kelly, Mr. James", "Wilkes, Mrs. James (Ellen Needs)", "M…
## $ Sex         <chr> "male", "female", "male", "male", "female", "male", "femal…
## $ Age         <dbl> 34.5, 47.0, 62.0, 27.0, 22.0, 14.0, 30.0, 26.0, 18.0, 21.0…
## $ SibSp       <int> 0, 1, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0…
## $ Parch       <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Ticket      <chr> "330911", "363272", "240276", "315154", "3101298", "7538",…
## $ Fare        <dbl> 7.8292, 7.0000, 9.6875, 8.6625, 12.2875, 9.2250, 7.6292, 2…
## $ Cabin       <chr> "", "", "", "", "", "", "", "", "", "", "", "", "B45", "",…
## $ Embarked    <chr> "Q", "S", "Q", "S", "S", "S", "Q", "S", "C", "S", "S", "S"…

# `df` was already loaded at the top of the script, so the CSV is not read a
# second time here.
# Passenger counts per ticket class (Pclass takes the values 1, 2 and 3).
table(df$Pclass)

## 
##   1   2   3 
## 107  93 218

# Frequency table of passenger sex.
table(df[["Sex"]])

## 
## female   male 
##    152    266

# Overview plot of column types and missing cells (from the visdat package).
visdat::vis_dat(df)

# Plot of missingness per column (from the visdat package).
visdat::vis_miss(df)

# Count the NA entries in every column of `df`.
# vapply() guarantees exactly one integer per column, whereas the return type
# of sapply() depends on its input.
# Note: only real NA values are counted; empty strings (such as the "" entries
# in Cabin) are not treated as missing here.
missing_values_count <- vapply(df, function(x) sum(is.na(x)), integer(1))
print(missing_values_count)

## PassengerId      Pclass        Name         Sex         Age       SibSp 
##           0           0           0           0          86           0 
##       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           1           0           0

# Open the help page only in an interactive session, so that rendering the
# script does not start the help server.
if (interactive()) help("na.omit")

## starting httpd help server ... done

# Complete-case copy of the data: na.omit() drops every row that contains an NA.
df_drop <- na.omit(df)

# Open the help page only in an interactive session, so that rendering the
# script does not start the help server.
if (interactive()) help("stargazer", package = "stargazer")

# BASIC COMMAND
## stargazer(df, type = "text") # Age has only 332 observations and Fare 417; all other variables have 418.

# EMBELLISHED COMMAND
# Derive the counts quoted in the table note from the data instead of
# hard-coding them, so the note stays correct if the input file changes.
n_total       <- nrow(df)
n_missing_age <- sum(is.na(df$Age))

stargazer::stargazer(
  df,
  type         = "text",  # output format; "html" is the alternative for HTML reports
  notes        = sprintf("N=%d, but age has %d missing values",
                         n_total, n_missing_age),
  summary.stat = c("mean", "sd", "min", "max"),
  digits       = 1,       # decimal places
  title        = "Titanic Data Summary Statistics Day3"
)

## 
## Titanic Data Summary Statistics Day3
## ========================================
## Statistic      Mean   St. Dev. Min  Max 
## ----------------------------------------
## PassengerId  1,100.5   120.8   892 1,309
## Pclass         2.3      0.8     1    3  
## Age            30.3     14.2   0.2 76.0 
## SibSp          0.4      0.9     0    8  
## Parch          0.4      1.0     0    9  
## Fare           35.6     55.9   0.0 512.3
## ----------------------------------------
## N=418, but age has 86 missing values

My Two Observations

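Fare: The high standard deviation (55.9, against a mean of 35.6) suggests a broad range of ticket prices. 1st class and 3rd class have more passengers than 2nd class, so fares are concentrated at the expensive and the cheap ends, which can make the standard deviation relatively large.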
Age: The mean age is about 30 years (30.3), but the 86 missing values may affect this figure considerably.

Boxplot

# Help pages are only useful interactively; skip them when the script is
# rendered.
if (interactive()) {
  help("boxplot")
  help("hist")
}

# Remember the current graphics parameters so they can be restored afterwards.
old_par <- par(no.readonly = TRUE)

# Layout to split the screen: a short strip on top for the boxplot (relative
# height 1) and a tall panel below for the histogram (relative height 8).
# `heights` is spelled out in full rather than relying on partial matching.
layout(mat     = matrix(c(1, 2), nrow = 2, ncol = 1, byrow = TRUE),
       heights = c(1, 8))

# Draw the boxplot and the histogram
# Top panel: zero bottom margin so the boxplot sits directly above the
# histogram.
par(mar = c(0, 3.1, 1.1, 2.1))

boxplot(df$Age,
        horizontal = TRUE,
        ylim       = c(0, 80),  # same 0-80 range as the histogram's xlim
        xaxt       = "n",       # suppress the x axis on this strip
        col        = rgb(0.8, 0.8, 0, 0.5),
        frame      = FALSE
        )

# Bottom panel: a bottom margin of 4 lines leaves room for the axis and label.
par(mar = c(4, 3.1, 1.1, 2.1))

hist(df$Age,
     breaks = 10,
     col    = rgb(0.2, 0.8, 0.5, 0.5),
     border = FALSE,
     main   = "",
     xlab   = "Age",
     xlim   = c(0, 80)
     )

# Put the graphics state back: single-panel layout and the saved parameters.
layout(1)
par(old_par)

HW_Day3

CHJ

2024-08-01

My Two Observations

Boxplot