library(readxl)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
dat <- read_excel("set1.xlsx")
dat <- dat %>% filter(Phu_trach == "Khánh")
dat$Pclass <- as.factor(dat$Pclass)
dat$Survival <- factor(dat$Survival, ordered = T, labels = c("No", "Yes"))
dat$Sex <- as.factor(dat$Sex)
dat$Embarked <- as.factor(dat$Embarked)
dat$Age <- as.integer(dat$Age)
describe(dat)
## vars n mean sd median trimmed mad min max range skew
## ID 1 50 298.50 14.58 298.5 298.50 18.53 274.00 323 49.00 0.00
## Survival* 2 50 1.40 0.49 1.0 1.38 0.00 1.00 2 1.00 0.40
## Pclass* 3 50 2.30 0.81 3.0 2.38 0.00 1.00 3 2.00 -0.58
## Name* 4 50 25.50 14.58 25.5 25.50 18.53 1.00 50 49.00 0.00
## Sex* 5 50 1.62 0.49 2.0 1.65 0.00 1.00 2 1.00 -0.48
## Age 6 50 26.80 11.58 25.0 27.05 8.90 1.00 60 59.00 0.01
## SibSp 7 50 0.48 0.97 0.0 0.25 0.00 0.00 5 5.00 2.72
## Parch 8 50 0.42 0.86 0.0 0.23 0.00 0.00 4 4.00 2.13
## Fare 9 50 38.91 58.75 13.0 23.90 7.72 4.01 263 258.99 2.39
## Embarked* 10 50 1.74 0.44 2.0 1.80 0.00 1.00 2 1.00 -1.06
## Phu_trach* 11 50 1.00 0.00 1.0 1.00 0.00 1.00 1 0.00 NaN
## kurtosis se
## ID -1.27 2.06
## Survival* -1.88 0.07
## Pclass* -1.28 0.12
## Name* -1.27 2.06
## Sex* -1.80 0.07
## Age 0.64 1.64
## SibSp 8.25 0.14
## Parch 4.46 0.12
## Fare 5.17 8.31
## Embarked* -0.89 0.06
## Phu_trach* NaN 0.00
dat$discrete_age <- cut_width(dat$Age, 12)
dat <- dat %>% mutate(discrete_age = case_when(
discrete_age == "[-6,6]" ~ "< 6",
discrete_age == "(6,18]" ~ "6 - 18",
discrete_age == "(18,30]" ~ "18 - 30",
discrete_age == "(30,42]" ~ "30 - 42",
discrete_age == "(42,54]" ~ "42 - 54",
discrete_age == "(54,66]" ~ "> 54"
))
dat$discrete_age <- as.factor(dat$discrete_age)
attributes(dat$discrete_age)$levels <- c("< 6", "6 - 18", "18 - 30", "30 - 42", "42 - 54", "> 54")
ggplot(dat, aes(x = discrete_age)) +
geom_histogram(stat = "count", fill = "#124B92") +
theme_minimal() +
labs(x = "Age", y = "Count")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
freq_tab <- as.data.frame(table(dat$discrete_age)) %>% left_join(
as.data.frame(prop.table(table(dat$discrete_age))) %>% rename(Prop = Freq)
)
## Joining, by = "Var1"
freq_tab
## Var1 Freq Prop
## 1 < 6 4 0.08
## 2 6 - 18 1 0.02
## 3 18 - 30 26 0.52
## 4 30 - 42 13 0.26
## 5 42 - 54 2 0.04
## 6 > 54 4 0.08
library(ggrepel)
ggplot(freq_tab, aes(x = "", y = Prop, fill = Var1)) +
geom_col() +
coord_polar(theta = "y") +
geom_label(
aes(label = paste(Prop * 100, "%")),
position = position_stack(vjust = 0.5),
show.legend = FALSE
) +
scale_fill_brewer(palette = "Pastel1") +
theme_void() +
guides(
fill = guide_legend(title = "Age", label = T)
)
dat %>%
# mutate(Survival = factor(Survival, labels = c("No", "Yes"))) %>%
group_by(discrete_age, Survival) %>%
summarise(count = n()) %>%
ggplot(aes(discrete_age, count, fill = Survival)) +
geom_bar(stat = "identity", position = 'dodge') +
scale_fill_brewer(palette = "Pastel2") +
theme_minimal() +
labs(x = "Age", y = "Count")
## `summarise()` has grouped output by 'discrete_age'. You can override using the `.groups` argument.
m1 <- lm(Fare ~ Age, data = dat)
summary(m1)
##
## Call:
## lm(formula = Fare ~ Age, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.215 -30.660 -25.958 5.253 225.557
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.8639 21.2254 1.171 0.247
## Age 0.5241 0.7281 0.720 0.475
##
## Residual standard error: 59.04 on 48 degrees of freedom
## Multiple R-squared: 0.01068, Adjusted R-squared: -0.009931
## F-statistic: 0.5181 on 1 and 48 DF, p-value: 0.4751
anova(m1)
## Analysis of Variance Table
##
## Response: Fare
## Df Sum Sq Mean Sq F value Pr(>F)
## Age 1 1806 1806.0 0.5181 0.4751
## Residuals 48 167305 3485.5
ggplot(dat, aes(Age, Fare)) +
geom_point(col = "#777777") +
geom_smooth(method = "lm", se = F) +
theme_minimal()
## `geom_smooth()` using formula 'y ~ x'
dat <- read_excel("set1.xlsx")
dat <- dat %>% filter(Phu_trach == "anh")
dat$Pclass <- as.factor(dat$Pclass)
dat$Survival <- factor(dat$Survival, ordered = T, labels = c("No", "Yes"))
dat$Sex <- as.factor(dat$Sex)
dat$Embarked <- as.factor(dat$Embarked)
dat$Age <- as.integer(dat$Age)
describe(dat)
## vars n mean sd median trimmed mad min max range skew
## ID 1 50 25.50 14.58 25.50 25.50 18.53 1.00 50 49.00 0.00
## Survival* 2 50 1.40 0.49 1.00 1.38 0.00 1.00 2 1.00 0.40
## Pclass* 3 50 2.30 0.84 3.00 2.38 0.00 1.00 3 2.00 -0.59
## Name* 4 50 25.50 14.58 25.50 25.50 18.53 1.00 50 49.00 0.00
## Sex* 5 50 1.50 0.51 1.50 1.50 0.74 1.00 2 1.00 0.00
## Age 6 50 26.88 16.33 26.50 25.77 17.05 2.00 66 64.00 0.50
## SibSp 7 50 0.92 1.23 1.00 0.67 1.48 0.00 5 5.00 1.58
## Parch 8 50 0.56 1.15 0.00 0.30 0.00 0.00 5 5.00 2.49
## Fare 9 50 31.25 39.47 21.04 23.71 18.12 7.23 263 255.77 4.13
## Embarked* 10 50 2.54 0.81 3.00 2.67 0.00 1.00 3 2.00 -1.24
## Phu_trach* 11 50 1.00 0.00 1.00 1.00 0.00 1.00 1 0.00 NaN
## kurtosis se
## ID -1.27 2.06
## Survival* -1.88 0.07
## Pclass* -1.35 0.12
## Name* -1.27 2.06
## Sex* -2.04 0.07
## Age -0.34 2.31
## SibSp 1.86 0.17
## Parch 6.36 0.16
## Fare 21.00 5.58
## Embarked* -0.34 0.12
## Phu_trach* NaN 0.00
dat$discrete_age <- cut_width(dat$Age, 12)
dat <- dat %>% mutate(discrete_age = case_when(
discrete_age == "[-6,6]" ~ "< 6",
discrete_age == "(6,18]" ~ "6 - 18",
discrete_age == "(18,30]" ~ "18 - 30",
discrete_age == "(30,42]" ~ "30 - 42",
discrete_age == "(42,54]" ~ "42 - 54",
discrete_age == "(54,66]" ~ "> 54"
))
dat$discrete_age <- as.factor(dat$discrete_age)
attributes(dat$discrete_age)$levels <- c("< 6", "6 - 18", "18 - 30", "30 - 42", "42 - 54", "> 54")
ggplot(dat, aes(x = discrete_age)) +
geom_histogram(stat = "count", fill = "#124B92") +
theme_minimal() +
labs(x = "Age", y = "Count")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
freq_tab <- as.data.frame(table(dat$discrete_age)) %>% left_join(
as.data.frame(prop.table(table(dat$discrete_age))) %>% rename(Prop = Freq)
)
## Joining, by = "Var1"
freq_tab
## Var1 Freq Prop
## 1 < 6 6 0.12
## 2 6 - 18 4 0.08
## 3 18 - 30 17 0.34
## 4 30 - 42 11 0.22
## 5 42 - 54 3 0.06
## 6 > 54 9 0.18
library(ggrepel)
ggplot(freq_tab, aes(x = "", y = Prop, fill = Var1)) +
geom_col() +
coord_polar(theta = "y") +
geom_label(
aes(label = paste(Prop * 100, "%")),
position = position_stack(vjust = 0.5),
show.legend = FALSE
) +
scale_fill_brewer(palette = "Pastel1") +
theme_void() +
guides(
fill = guide_legend(title = "Age", label = T)
)
dat %>%
# mutate(Survival = factor(Survival, labels = c("No", "Yes"))) %>%
group_by(discrete_age, Survival) %>%
summarise(count = n()) %>%
ggplot(aes(discrete_age, count, fill = Survival)) +
geom_bar(stat = "identity", position = 'dodge') +
scale_fill_brewer(palette = "Pastel2") +
theme_minimal() +
labs(x = "Age", y = "Count")
## `summarise()` has grouped output by 'discrete_age'. You can override using the `.groups` argument.
m1 <- lm(Fare ~ Age, data = dat)
summary(m1)
##
## Call:
## lm(formula = Fare ~ Age, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.401 -21.243 -9.594 4.002 233.693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.6203 10.8872 2.261 0.0283 *
## Age 0.2467 0.3471 0.711 0.4807
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.67 on 48 degrees of freedom
## Multiple R-squared: 0.01041, Adjusted R-squared: -0.0102
## F-statistic: 0.505 on 1 and 48 DF, p-value: 0.4807
anova(m1)
## Analysis of Variance Table
##
## Response: Fare
## Df Sum Sq Mean Sq F value Pr(>F)
## Age 1 795 794.77 0.505 0.4807
## Residuals 48 75539 1573.72
ggplot(dat, aes(Age, Fare)) +
geom_point(col = "#777777") +
geom_smooth(method = "lm", se = F) +
theme_minimal()
## `geom_smooth()` using formula 'y ~ x'
dat <- read_excel("set1.xlsx")
dat <- dat %>% filter(Phu_trach == "nam")
dat$Pclass <- as.factor(dat$Pclass)
dat$Survival <- factor(dat$Survival, ordered = T, labels = c("No", "Yes"))
dat$Sex <- as.factor(dat$Sex)
dat$Embarked <- as.factor(dat$Embarked)
dat$Age <- as.integer(dat$Age)
describe(dat)
## vars n mean sd median trimmed mad min max range skew
## ID 1 50 389.50 14.58 389.50 389.50 18.53 365.00 414.00 49.0 0.00
## Survival* 2 50 1.42 0.50 1.00 1.40 0.00 1.00 2.00 1.0 0.31
## Pclass* 3 50 2.16 0.91 2.50 2.20 0.74 1.00 3.00 2.0 -0.31
## Name* 4 50 25.50 14.58 25.50 25.50 18.53 1.00 50.00 49.0 0.00
## Sex* 5 50 1.64 0.48 2.00 1.68 0.00 1.00 2.00 1.0 -0.57
## Age 6 50 33.72 16.00 32.50 33.42 15.57 0.00 71.00 71.0 0.24
## SibSp 7 50 0.36 0.83 0.00 0.20 0.00 0.00 5.00 5.0 3.73
## Parch 8 50 0.22 0.58 0.00 0.05 0.00 0.00 2.00 2.0 2.39
## Fare 9 50 30.47 32.41 17.58 23.80 14.36 7.05 151.55 144.5 1.78
## Embarked* 10 50 2.60 0.78 3.00 2.75 0.00 1.00 3.00 2.0 -1.45
## Phu_trach* 11 50 1.00 0.00 1.00 1.00 0.00 1.00 1.00 0.0 NaN
## kurtosis se
## ID -1.27 2.06
## Survival* -1.94 0.07
## Pclass* -1.75 0.13
## Name* -1.27 2.06
## Sex* -1.71 0.07
## Age -0.41 2.26
## SibSp 17.22 0.12
## Parch 4.23 0.08
## Fare 2.74 4.58
## Embarked* 0.21 0.11
## Phu_trach* NaN 0.00
dat$discrete_age <- cut_width(dat$Age, 12)
dat <- dat %>% mutate(discrete_age = case_when(
discrete_age == "[-6,6]" ~ "< 6",
discrete_age == "(6,18]" ~ "6 - 18",
discrete_age == "(18,30]" ~ "18 - 30",
discrete_age == "(30,42]" ~ "30 - 42",
discrete_age == "(42,54]" ~ "42 - 54",
discrete_age == "(54,66]" ~ "> 54"
))
dat$discrete_age <- as.factor(dat$discrete_age)
attributes(dat$discrete_age)$levels <- c("< 6", "6 - 18", "18 - 30", "30 - 42", "42 - 54", "> 54")
ggplot(dat, aes(x = discrete_age)) +
geom_histogram(stat = "count", fill = "#124B92") +
theme_minimal() +
labs(x = "Age", y = "Count")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
freq_tab <- as.data.frame(table(dat$discrete_age)) %>% left_join(
as.data.frame(prop.table(table(dat$discrete_age))) %>% rename(Prop = Freq)
)
## Joining, by = "Var1"
freq_tab
## Var1 Freq Prop
## 1 < 6 2 0.04081633
## 2 6 - 18 5 0.10204082
## 3 18 - 30 17 0.34693878
## 4 30 - 42 12 0.24489796
## 5 42 - 54 8 0.16326531
## 6 > 54 5 0.10204082
library(ggrepel)
ggplot(freq_tab, aes(x = "", y = Prop, fill = Var1)) +
geom_col() +
coord_polar(theta = "y") +
geom_label(
aes(label = paste(round(Prop * 100, 2), "%")),
position = position_stack(vjust = 0.5),
show.legend = FALSE
) +
scale_fill_brewer(palette = "Pastel1") +
theme_void() +
guides(
fill = guide_legend(title = "Age", label = T)
)
dat %>%
# mutate(Survival = factor(Survival, labels = c("No", "Yes"))) %>%
group_by(discrete_age, Survival) %>%
summarise(count = n()) %>%
ggplot(aes(discrete_age, count, fill = Survival)) +
geom_bar(stat = "identity", position = 'dodge') +
scale_fill_brewer(palette = "Pastel2") +
theme_minimal() +
labs(x = "Age", y = "Count")
## `summarise()` has grouped output by 'discrete_age'. You can override using the `.groups` argument.
m1 <- lm(Fare ~ Age, data = dat)
summary(m1)
##
## Call:
## lm(formula = Fare ~ Age, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.460 -22.378 -13.137 2.807 121.010
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.754592 10.891043 2.824 0.00689 **
## Age -0.008586 0.292327 -0.029 0.97669
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.75 on 48 degrees of freedom
## Multiple R-squared: 1.797e-05, Adjusted R-squared: -0.02081
## F-statistic: 0.0008626 on 1 and 48 DF, p-value: 0.9767
anova(m1)
## Analysis of Variance Table
##
## Response: Fare
## Df Sum Sq Mean Sq F value Pr(>F)
## Age 1 1 0.93 9e-04 0.9767
## Residuals 48 51478 1072.47
ggplot(dat, aes(Age, Fare)) +
geom_point(col = "#777777") +
geom_smooth(method = "lm", se = F) +
theme_minimal()
## `geom_smooth()` using formula 'y ~ x'