# Load the Titanic passenger data from a local CSV export.
df <- read.csv(file = "C:/Users/HP/Downloads/test2.csv")
# Other quick ways to inspect the imported data:
# head(df) # first rows of the data (6 by default)
# tail(df) # last rows of the data (6 by default)
# str(df)  # compact structure listing
# Column-wise preview of names, types and leading values (tidyverse).
glimpse(df)
## Rows: 418
## Columns: 11
## $ PassengerId <int> 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903…
## $ Pclass <int> 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 1, 1, 2, 1, 2, 2, 3, 3, 3…
## $ Name <chr> "Kelly, Mr. James", "Wilkes, Mrs. James (Ellen Needs)", "M…
## $ Sex <chr> "male", "female", "male", "male", "female", "male", "femal…
## $ Age <dbl> 34.5, 47.0, 62.0, 27.0, 22.0, 14.0, 30.0, 26.0, 18.0, 21.0…
## $ SibSp <int> 0, 1, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0…
## $ Parch <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Ticket <chr> "330911", "363272", "240276", "315154", "3101298", "7538",…
## $ Fare <dbl> 7.8292, 7.0000, 9.6875, 8.6625, 12.2875, 9.2250, 7.6292, 2…
## $ Cabin <chr> "", "", "", "", "", "", "", "", "", "", "", "", "B45", "",…
## $ Embarked <chr> "Q", "S", "Q", "S", "S", "S", "Q", "S", "C", "S", "S", "S"…
# Frequency tables for the categorical columns. `df` is already in memory from
# the read.csv() call at the top of the script, so the file is not read again.
table(df$Pclass)
##
## 1 2 3
## 107 93 218
table(df$Sex)
##
## female male
## 152 266
# Visual overview of column types and missingness (both from the visdat package).
vis_dat(df)
vis_miss(df)
# Count NA entries per column. vapply() pins the result to exactly one integer
# per column, whereas sapply()'s return type depends on its input.
# Note: is.na() does not flag empty strings, so the blank Cabin values ("")
# are reported as 0 missing in the output below.
missing_values_count <- vapply(
  df,
  function(column) sum(is.na(column)),
  integer(1)
)
print(missing_values_count)
## PassengerId Pclass Name Sex Age SibSp
## 0 0 0 0 86 0
## Parch Ticket Fare Cabin Embarked
## 0 0 1 0 0
# Listwise deletion: keep only the rows that have no NA in any column.
# (Run ?na.omit interactively for details; help lookups are kept out of the
# script so it runs without opening the help server.)
df_drop <- na.omit(df)
# Summary-statistics table for the numeric columns (stargazer package).
# Run ?stargazer interactively for the full list of arguments.
# BASIC COMMAND
# stargazer(df, type = "text")
#   -> Age has only 332 observations, while all other variables have 418.
# EMBELLISHED COMMAND
stargazer(
  df,
  type = "text", # output format; "html" is the alternative mentioned here
  title = "Titanic Data Summary Statistics Day3",
  summary.stat = c("mean", "sd", "min", "max"),
  digits = 1, # decimal places
  notes = "N=418, but age has 86 missing values"
)
##
## Titanic Data Summary Statistics Day3
## ========================================
## Statistic Mean St. Dev. Min Max
## ----------------------------------------
## PassengerId 1,100.5 120.8 892 1,309
## Pclass 2.3 0.8 1 3
## Age 30.3 14.2 0.2 76.0
## SibSp 0.4 0.9 0 8
## Parch 0.4 1.0 0 9
## Fare 35.6 55.9 0.0 512.3
## ----------------------------------------
## N=418, but age has 86 missing values
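Fare: The high standard deviation (55.9 against a mean of 35.6) indicates a broad range of ticket prices. Because 1st class and 3rd class each have more passengers than 2nd class, fares are concentrated at both the expensive and the cheap end, which makes the standard deviation relatively large.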
The mean age is about 30 years (30.3), but the 86 missing Age values may affect this figure considerably.
# Stacked figure: a slim horizontal boxplot of Age above its histogram.
# (Run ?layout, ?boxplot or ?hist interactively for argument details.)

# Split the device into two rows and one column. The relative heights 1:8 give
# the top panel one ninth of the figure. The argument is spelled out as
# `heights` instead of relying on partial matching of `height`.
layout(
  mat = matrix(c(1, 2), nrow = 2, ncol = 1, byrow = TRUE),
  heights = c(1, 8)
)

# Top panel: zero bottom margin so the boxplot sits directly on the histogram.
par(mar = c(0, 3.1, 1.1, 2.1))
boxplot(
  df$Age,
  horizontal = TRUE,
  ylim = c(0, 80), # same 0-80 range as the histogram's xlim below
  xaxt = "n", # no axis on the boxplot; the histogram carries the Age axis
  col = rgb(0.8, 0.8, 0, 0.5),
  frame = FALSE
)

# Bottom panel: restore a bottom margin to make room for the axis and label.
par(mar = c(4, 3.1, 1.1, 2.1))
hist(
  df$Age,
  breaks = 10,
  col = rgb(0.2, 0.8, 0.5, 0.5),
  border = FALSE,
  main = "",
  xlab = "Age",
  xlim = c(0, 80)
)